"""Source code for gatenlp.serialization.default."""


import io
import json
from msgpack import pack, Unpacker
from gatenlp.document import Document, _AnnotationSetsDict
from gatenlp.annotation_set import AnnotationSet
from gatenlp.annotation import Annotation
from gatenlp.changelog import ChangeLog


class JsonSerializer:
    """Save objects as JSON and load them back, via a file or an in-memory string.

    The serializer itself only handles the JSON encoding and the I/O: the
    object to save must provide ``to_dict`` and the class to load must
    provide ``from_dict``.
    """

    @staticmethod
    def save(clazz, inst, to_file=None, to_mem=None, offset_type=None, offset_mapper=None, **kwargs):
        """Serialize ``inst`` as JSON.

        Args:
            clazz: class of the object being saved (not used by this serializer).
            inst: the object to save; must provide
                ``to_dict(offset_type=..., offset_mapper=..., **kwargs)``.
            to_file: path of the file to write when ``to_mem`` is falsy.
            to_mem: if truthy, return the JSON string instead of writing a file.
            offset_type: passed through to ``inst.to_dict``.
            offset_mapper: passed through to ``inst.to_dict``.
            **kwargs: passed through to ``inst.to_dict``.

        Returns:
            The JSON string if ``to_mem`` is truthy, otherwise ``None``.
        """
        d = inst.to_dict(offset_type=offset_type, offset_mapper=offset_mapper, **kwargs)
        if to_mem:
            return json.dumps(d)
        # JSON interchange is defined in terms of UTF-8; do not depend on the
        # platform's default text encoding.
        with open(to_file, "wt", encoding="utf-8") as outfp:
            json.dump(d, outfp)
        return None

    @staticmethod
    def load(clazz, from_file=None, from_mem=None, offset_mapper=None, **kwargs):
        """Create an object of type ``clazz`` from JSON.

        Args:
            clazz: the class to create; must provide
                ``from_dict(d, offset_mapper=..., **kwargs)``.
            from_file: path of the JSON file to read when ``from_mem`` is falsy.
            from_mem: if truthy, the JSON text to parse instead of reading a file.
            offset_mapper: passed through to ``clazz.from_dict``.
            **kwargs: passed through to ``clazz.from_dict``.

        Returns:
            Whatever ``clazz.from_dict`` returns for the decoded dictionary.
        """
        if from_mem:
            d = json.loads(from_mem)
        else:
            # Read as UTF-8 explicitly so files containing unescaped non-ASCII
            # text decode the same way on every platform.
            with open(from_file, "rt", encoding="utf-8") as infp:
                d = json.load(infp)
        return clazz.from_dict(d, offset_mapper=offset_mapper, **kwargs)
# Format/version marker for MsgPack data: MsgPackSerializer.document2stream
# packs it as the first value and MsgPackSerializer.stream2document rejects
# any stream whose first value differs from it.
MSGPACK_VERSION_HDR = "sm1"
class MsgPackSerializer:
    """Save documents in MsgPack format and load them back, via a file or bytes.

    A document is stored as a flat sequence of MsgPack values: the version
    header, offset type, text, features and the number of annotation sets;
    then for each set its name, next annotation id and annotation count,
    followed by type, start, end, id and features of every annotation.
    """

    @staticmethod
    def document2stream(doc: Document, stream):
        """Write ``doc`` to ``stream`` as a sequence of MsgPack values.

        Args:
            doc: the document to serialize.
            stream: a writable binary stream the packed values are written to.
        """
        pack(MSGPACK_VERSION_HDR, stream)
        pack(doc.offset_type, stream)
        pack(doc.text, stream)
        pack(doc._features, stream)
        pack(len(doc.annotation_sets), stream)
        # Iterate (name, set) pairs explicitly: iterating a mapping directly
        # yields only its keys, which cannot be unpacked into name and set.
        # NOTE(review): assumes doc.annotation_sets is dict-like (stream2document
        # installs an _AnnotationSetsDict there and fills it by key) - confirm.
        for name, annset in doc.annotation_sets.items():
            pack(name, stream)
            pack(annset.next_annid, stream)
            pack(len(annset), stream)
            for ann in annset.fast_iter():
                pack(ann.type, stream)
                pack(ann.start, stream)
                pack(ann.end, stream)
                pack(ann.id, stream)
                pack(ann.features, stream)

    @staticmethod
    def stream2document(stream):
        """Read a document from ``stream``, the inverse of ``document2stream``.

        Args:
            stream: a readable binary stream containing the packed document.

        Returns:
            The reconstructed Document.

        Raises:
            Exception: if the first value in the stream is not the expected
                version header.
        """
        u = Unpacker(stream)
        version = u.unpack()
        if version != MSGPACK_VERSION_HDR:
            raise Exception("MsgPack data starts with wrong version")
        doc = Document()
        doc.offset_type = u.unpack()
        doc._text = u.unpack()
        doc._features = u.unpack()
        nsets = u.unpack()
        setsdict = _AnnotationSetsDict(owner_doc=doc)
        doc.annotation_sets = setsdict
        for _ in range(nsets):
            sname = u.unpack()
            # A set name stored as None is mapped to the empty string.
            if sname is None:
                sname = ""
            annset = AnnotationSet(name=sname, owner_doc=doc)
            annset._next_annid = u.unpack()
            nanns = u.unpack()
            for _ in range(nanns):
                # Values must be read in exactly the order they were packed.
                atype = u.unpack()
                astart = u.unpack()
                aend = u.unpack()
                aid = u.unpack()
                afeatures = u.unpack()
                ann = Annotation(astart, aend, atype, annid=aid, features=afeatures)
                # Store directly in the set's internal id -> annotation mapping.
                annset._annotations[aid] = ann
            setsdict[sname] = annset
        return doc

    @staticmethod
    def save(clazz, inst, to_file=None, to_mem=None, offset_type=None, offset_mapper=None, **kwargs):
        """Serialize ``inst`` in MsgPack format.

        Args:
            clazz: class of the object being saved (not used; the writer is
                chosen from the type of ``inst``).
            inst: the object to save; only Document instances are supported.
            to_file: path of the file to write when ``to_mem`` is falsy.
            to_mem: if truthy, return the packed bytes instead of writing a file.
            offset_type: accepted for interface compatibility, not used here.
            offset_mapper: accepted for interface compatibility, not used here.
            **kwargs: accepted for interface compatibility, not used here.

        Returns:
            The packed bytes if ``to_mem`` is truthy, otherwise ``None``.

        Raises:
            Exception: if ``inst`` is a ChangeLog (not implemented yet) or any
                other unsupported object.
        """
        if isinstance(inst, Document):
            writer = MsgPackSerializer.document2stream
        elif isinstance(inst, ChangeLog):
            raise Exception("Not implemented yet")
        else:
            raise Exception("Object not supported")
        if to_mem:
            f = io.BytesIO()
            writer(inst, f)
            return f.getvalue()
        # The context manager closes the file even if the writer raises.
        with open(to_file, "wb") as f:
            writer(inst, f)
        return None

    @staticmethod
    def load(clazz, from_file=None, from_mem=None, offset_mapper=None, **kwargs):
        """Create an object of type ``clazz`` from MsgPack data.

        Args:
            clazz: the class to create; only Document is supported.
            from_file: path of the file to read when ``from_mem`` is falsy.
            from_mem: if truthy, the packed bytes to read instead of a file.
            offset_mapper: accepted for interface compatibility, not used here.
            **kwargs: accepted for interface compatibility, not used here.

        Returns:
            The reconstructed object.

        Raises:
            Exception: if ``clazz`` is ChangeLog (not implemented yet) or any
                other unsupported class, or if the data has the wrong version
                header.
        """
        if clazz == Document:
            reader = MsgPackSerializer.stream2document
        elif clazz == ChangeLog:
            raise Exception("Not implemented yet")
        else:
            raise Exception("Object not supported")
        if from_mem:
            return reader(io.BytesIO(from_mem))
        # The reader consumes the stream completely before returning, so the
        # file can be closed as soon as it is done (it was never closed before).
        with open(from_file, "rb") as f:
            return reader(f)
# Maps each supported format name to the serializer class implementing it.
FORMATS = {
    "json": JsonSerializer,
    "msgpack": MsgPackSerializer,
}