Source code for gatenlp.lib_stanza

"""
Support for using Stanford Stanza (see https://stanfordnlp.github.io/stanza/):
convert from Stanford Stanza output to gatenlp documents and annotations.
"""
from gatenlp import Document


def apply_stanza(nlp, gatenlpdoc, setname=""):
    """
    Run the Stanford Stanza pipeline on the gatenlp document and transfer the
    annotations. This modifies the gatenlp document in place.

    :param nlp: Stanza pipeline
    :param gatenlpdoc: gatenlp document
    :param setname: annotation set name to add the annotations to
    :return: the modified gatenlp document
    """
    doc = nlp(gatenlpdoc.text)
    return stanza2gatenlp(doc, gatenlpdoc=gatenlpdoc, setname=setname)
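
# Usage sketch for apply_stanza (a minimal example, assuming the stanza package
# is installed and an English model has been downloaded via stanza.download("en");
# the text is only an illustration):
#
#   import stanza
#
#   nlp = stanza.Pipeline("en")
#   gdoc = Document("Some text to annotate.")
#   apply_stanza(nlp, gdoc)  # gdoc now carries Token/Sentence/entity annotations
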
def tok2tok(tok):
    """
    Create a copy of a Stanza token, prepared for creating an annotation: this is
    a dict that has "start", "end" and "id" keys and everything else in a nested
    dict under "fm".

    :param tok: original stanza token
    :return: what we use to create a Token annotation
    """
    newtok = {}
    newtok["id"] = tok["id"]
    fm = {}
    fm.update(tok)
    newtok["fm"] = fm
    # expand the "feats" string (e.g. "Number=Plur|Person=3") into individual features
    feats = fm.get("feats")
    if feats is not None:
        del fm["feats"]
        for feat in feats.split("|"):
            k, v = feat.split("=")
            fm[k] = v
    # the "misc" string carries the character offsets plus any other settings
    misc = fm.get("misc")
    if misc is not None:
        del fm["misc"]
        msettings = misc.split("|")
        ostart = None
        oend = None
        othersettings = []
        for ms in msettings:
            k, v = ms.split("=")
            if k == "start_char":
                ostart = int(v)
            elif k == "end_char":
                oend = int(v)
            else:
                othersettings.append(ms)
        if ostart is not None:
            newtok["start"] = ostart
        if oend is not None:
            newtok["end"] = oend
        # copy any remaining misc settings over as features
        for os in othersettings:
            k, v = os.split("=")
            fm[k] = v
    return newtok
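
# Example of what tok2tok returns for a (hypothetical) single-word token dict as
# produced by stanza's Token.to_dict(), assuming CoNLL-U style string fields:
#
#   tok2tok({"id": "2", "text": "cats", "upos": "NOUN", "feats": "Number=Plur",
#            "misc": "start_char=4|end_char=8"})
#   # -> {"id": "2",
#   #     "fm": {"id": "2", "text": "cats", "upos": "NOUN", "Number": "Plur"},
#   #     "start": 4, "end": 8}
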
def stanza2gatenlp(
    stanzadoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    sentence_type="Sentence",
    add_entities=True,
    ent_prefix=None,
):
    """
    Convert a Stanford Stanza document to a gatenlp document. If a gatenlp document
    is already provided, add the annotations from the Stanford Stanza document to it.
    In this case the original gatenlpdoc is used and gets modified.

    :param stanzadoc: a Stanford Stanza document
    :param gatenlpdoc: if None, a new gatenlp document is created, otherwise this
        document is added to
    :param setname: the annotation set name to which the annotations get added,
        empty string for the default annotation set
    :param token_type: the annotation type to use for tokens
    :param sentence_type: the annotation type to use for sentence annotations
    :param add_entities: if True, add any entities as well
    :param ent_prefix: if None, use the original entity type as the annotation type,
        otherwise prepend the given string to the annotation type
    :return: the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanzadoc.text)
    else:
        retdoc = gatenlpdoc
    annset = retdoc.get_annotations(setname)
    # Stanza processes text in sentence chunks, so we do everything per sentence
    for sent in stanzadoc.sentences:
        # Go through the tokens: in stanza, each token is a list of dicts; normally
        # there is one dict which also has the offset information in "misc", but for
        # multiword tokens there seems to be one "header" dict for the range of words
        # which has the offset info and NER label, and then one additional element per
        # word which has all the rest.
        # For our purposes we create a list of dicts where for normal tokens we just
        # copy the element, but for multiword tokens we copy over something that has
        # fake offsets and all the features.
        newtokens = []
        for t in sent.tokens:
            t = t.to_dict()
            if len(t) == 1:
                newtokens.append(tok2tok(t[0]))
            else:
                tokinfo = tok2tok(t[0])
                words = t[1:]
                fm = tokinfo.get("fm")
                ner = fm.get("ner")
                text = fm.get("text")
                start = tokinfo["start"]
                end = tokinfo["end"]
                for i, w in enumerate(words):
                    tok = tok2tok(w)
                    tok["fm"]["ner"] = ner
                    tok["fm"]["token_text"] = text
                    # fake per-word offsets within the span of the multiword token
                    os = min(start + i, end - 1)
                    tok["start"] = os
                    if i == len(words) - 1:
                        tok["end"] = end
                    else:
                        tok["end"] = os + 1
                    newtokens.append(tok)
        # now go through the new token list and create annotations
        idx2annid = {}  # map stanza word id to annotation id
        starts = []
        ends = []
        for t in newtokens:
            start = t["start"]
            end = t["end"]
            stanzaid = t["id"]
            starts.append(start)
            ends.append(end)
            annid = annset.add(start, end, token_type, t["fm"])
            idx2annid[stanzaid] = annid
        # create a sentence annotation from the beginning of the first word to the
        # end of the last
        sentid = annset.add(starts[0], ends[-1], sentence_type)
        # now replace the head index with the corresponding annotation id; the head
        # index "0" is mapped to the sentence annotation
        idx2annid["0"] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            hd = ann.get_feature("head")
            if hd is not None:
                hd = str(hd)
                ann.set_feature("head", idx2annid[hd])
    # add the entities
    if add_entities:
        for e in stanzadoc.entities:
            if ent_prefix:
                anntype = ent_prefix + e.type
            else:
                anntype = e.type
            annset.add(e.start_char, e.end_char, anntype)
    return retdoc
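
# Minimal end-to-end sketch: assumes the stanza package is installed, an English
# model has been fetched via stanza.download("en"), and that annotation sets can
# be iterated as in current gatenlp; the text and set name are only illustrations.
if __name__ == "__main__":
    import stanza

    nlp = stanza.Pipeline("en")
    sdoc = nlp("Barack Obama was born in Hawaii.")
    gdoc = stanza2gatenlp(sdoc, setname="stanza", ent_prefix="Stanza_")
    for ann in gdoc.get_annotations("stanza"):
        print(ann)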