Module gatenlp.lib_stanfordnlp
Support for using stanfordnlp: convert from stanfordnlp output to gatenlp documents and annotations.
Expand source code
"""
Support for using stanfordnlp: convert from stanfordnlp output to gatenlp documents and annotations.
"""
from gatenlp import Document
from gatenlp import utils
def apply_stanfordnlp(nlp, gatenlpdoc, setname=""):
    """Annotate a gatenlp document in place using a StanfordNLP pipeline.

    The pipeline is called on the text of the gatenlp document and the resulting
    StanfordNLP document is handed to stanfordnlp2gatenlp, which adds the annotations
    to the given gatenlp document.

    Args:
      nlp: StanfordNLP pipeline, called with the document text
      gatenlpdoc: gatenlp document to process; it gets modified
      setname: name of the annotation set to add the annotations to (Default value = "")

    Returns:
      the value returned by stanfordnlp2gatenlp for the given gatenlp document
    """
    parsed = nlp(gatenlpdoc.text)
    annotated = stanfordnlp2gatenlp(parsed, gatenlpdoc=gatenlpdoc, setname=setname)
    return annotated
def stanfordnlp2gatenlp(stanfordnlpdoc, gatenlpdoc=None, setname="", word_type="Word",
                        sentence_type="Sentence"):
    """Convert a StanfordNLP document to a gatenlp document.

    If a gatenlp document is already provided, add the annotations from the StanfordNLP
    document to it. In this case the original gatenlpdoc is used and gets modified.

    For every sentence one annotation of type word_type is added per word, plus one
    annotation of type sentence_type spanning from the start of the first word to the
    end of the last word. The "governor" feature of each word annotation is set to the
    annotation id of its governing word, or to the id of the sentence annotation when
    the governor index is 0.

    Args:
      stanfordnlpdoc: a StanfordNLP document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
        document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
        for the default annotation set.
      word_type: the annotation type to use for word annotations (Default value = "Word")
      sentence_type: the annotation type to use for sentence annotations
        (Default value = "Sentence")

    Returns:
      the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    annset = retdoc.annset(setname)
    text = stanfordnlpdoc.text
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    # NOTE: the stanford elements do not contain any text offsets, so we have to match and find
    # them ourselves. for this we keep an index to the first character in the text which has
    # not been matched yet
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words. Some tokens consist of several
        # words. Dependency parsers are over words, so only the words get annotated for now.
        words = sent.words
        if not words:
            # nothing to align or annotate; indexing the match result below would fail
            continue
        # each returned entry is used as (start, end, word), with the offsets relative to
        # the text slice that was passed in
        offsetinfos = utils.match_substrings(text[notmatchedidx:], words,
                                             getstr=lambda x: x.text)
        if not offsetinfos:
            continue
        idx2annid = {}
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor)
            }
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    # split on the first "=" only, so a value containing "=" stays intact
                    k, v = feat.split("=", 1)
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_"+k] = v
            snlp_idx = int(word.index)
            annid = annset.add(oinfo[0]+notmatchedidx, oinfo[1]+notmatchedidx,
                               word_type, fm).id
            idx2annid[snlp_idx] = annid
        # remember the word annotation ids before the sentence id gets added to the map
        word_annids = list(idx2annid.values())
        # create a sentence annotation from beginning of first word to end of last
        sent_end = offsetinfos[-1][1]+notmatchedidx
        sentid = annset.add(offsetinfos[0][0]+notmatchedidx, sent_end, sentence_type).id
        # now replace the governor index with the corresponding annid, the governor index 0
        # is mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in word_annids:
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                ann.features["governor"] = idx2annid[gov]
        # continue matching directly after the last word: skipping an additional character
        # here would drop the first character of a sentence that starts without any
        # separating whitespace
        notmatchedidx = sent_end
    return retdoc
Functions
def apply_stanfordnlp(nlp, gatenlpdoc, setname='')
-
Run the stanford nlp pipeline on the gatenlp document and transfer the annotations. This modifies the gatenlp document in place.
Args
nlp
- StanfordNLP pipeline
gatenlpdoc
- gatenlp document
setname
- set to use (Default value = "")
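Returns
the gatenlp document as returned by stanfordnlp2gatenlp (the document that was passed in, with the annotations added)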
Expand source code
def apply_stanfordnlp(nlp, gatenlpdoc, setname=""):
    """Annotate a gatenlp document in place using a StanfordNLP pipeline.

    The pipeline is called on the text of the gatenlp document and the resulting
    StanfordNLP document is handed to stanfordnlp2gatenlp, which adds the annotations
    to the given gatenlp document.

    Args:
      nlp: StanfordNLP pipeline, called with the document text
      gatenlpdoc: gatenlp document to process; it gets modified
      setname: name of the annotation set to add the annotations to (Default value = "")

    Returns:
      the value returned by stanfordnlp2gatenlp for the given gatenlp document
    """
    parsed = nlp(gatenlpdoc.text)
    annotated = stanfordnlp2gatenlp(parsed, gatenlpdoc=gatenlpdoc, setname=setname)
    return annotated
def stanfordnlp2gatenlp(stanfordnlpdoc, gatenlpdoc=None, setname='', word_type='Word', sentence_type='Sentence')
-
Convert a StanfordNLP document to a gatenlp document. If a gatenlp document is already provided, add the annotations from the StanfordNLP document to it. In this case the original gatenlpdoc is used and gets modified.
Args
stanfordnlpdoc
- a StanfordNLP document
gatenlpdoc
- if None, a new gatenlp document is created, otherwise the annotations are added to this document. (Default value = None)
setname
- the annotation set name to which the annotations get added, empty string for the default annotation set.
word_type
- the annotation type to use for word (token) annotations (Default value = "Word")
sentence_type
- the annotation type to use for sentence annotations (Default value = "Sentence")
Returns
the new or modified gatenlp document
Expand source code
def stanfordnlp2gatenlp(stanfordnlpdoc, gatenlpdoc=None, setname="", word_type="Word",
                        sentence_type="Sentence"):
    """Convert a StanfordNLP document to a gatenlp document.

    If a gatenlp document is already provided, add the annotations from the StanfordNLP
    document to it. In this case the original gatenlpdoc is used and gets modified.

    For every sentence one annotation of type word_type is added per word, plus one
    annotation of type sentence_type spanning from the start of the first word to the
    end of the last word. The "governor" feature of each word annotation is set to the
    annotation id of its governing word, or to the id of the sentence annotation when
    the governor index is 0.

    Args:
      stanfordnlpdoc: a StanfordNLP document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
        document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
        for the default annotation set.
      word_type: the annotation type to use for word annotations (Default value = "Word")
      sentence_type: the annotation type to use for sentence annotations
        (Default value = "Sentence")

    Returns:
      the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    annset = retdoc.annset(setname)
    text = stanfordnlpdoc.text
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    # NOTE: the stanford elements do not contain any text offsets, so we have to match and find
    # them ourselves. for this we keep an index to the first character in the text which has
    # not been matched yet
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words. Some tokens consist of several
        # words. Dependency parsers are over words, so only the words get annotated for now.
        words = sent.words
        if not words:
            # nothing to align or annotate; indexing the match result below would fail
            continue
        # each returned entry is used as (start, end, word), with the offsets relative to
        # the text slice that was passed in
        offsetinfos = utils.match_substrings(text[notmatchedidx:], words,
                                             getstr=lambda x: x.text)
        if not offsetinfos:
            continue
        idx2annid = {}
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor)
            }
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    # split on the first "=" only, so a value containing "=" stays intact
                    k, v = feat.split("=", 1)
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_"+k] = v
            snlp_idx = int(word.index)
            annid = annset.add(oinfo[0]+notmatchedidx, oinfo[1]+notmatchedidx,
                               word_type, fm).id
            idx2annid[snlp_idx] = annid
        # remember the word annotation ids before the sentence id gets added to the map
        word_annids = list(idx2annid.values())
        # create a sentence annotation from beginning of first word to end of last
        sent_end = offsetinfos[-1][1]+notmatchedidx
        sentid = annset.add(offsetinfos[0][0]+notmatchedidx, sent_end, sentence_type).id
        # now replace the governor index with the corresponding annid, the governor index 0
        # is mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in word_annids:
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                ann.features["governor"] = idx2annid[gov]
        # continue matching directly after the last word: skipping an additional character
        # here would drop the first character of a sentence that starts without any
        # separating whitespace
        notmatchedidx = sent_end
    return retdoc