Package gatenlp

Expand source code
# NOTE: do not place a comment at the end of the version assignment
# line since we parse that line in a shell script!
__version__ = "0.9.6"
import logging
import sys
try:
    import sortedcontainers
except Exception:
    # sortedcontainers is a hard requirement: tell the user how to get it
    # and stop, instead of failing later with a less obvious error.
    for _message in (
        "ERROR: required package sortedcontainers cannot be imported!",
        "Please install it, using e.g. 'pip install -U sortedcontainers'",
    ):
        print(_message, file=sys.stderr)
    sys.exit(1)
# TODO: check version of sortedcontainers (we have 2.1.0)

from gatenlp.utils import init_logger
logger = init_logger("gatenlp")

# this attribute globally holds the processing resource last defined
# so it can be used for interacting with the GATE python plugin
from gatenlp.gate_interaction import _pr_decorator as GateNlpPr
from gatenlp.gate_interaction import interact
from gatenlp.annotation import Annotation
from gatenlp.document import Document
from gatenlp.annotation_set import AnnotationSet
from gatenlp.changelog import ChangeLog
from gatenlp.gateslave import GateSlave


def init_notebook():
    """
    Prepare a notebook session for the HTML annotation viewer.

    Calls `HtmlAnnViewerSerializer.init_javscript()` and then records in the
    global `gatenlpconfig` that the notebook JavaScript has been initialized.
    The imports are done lazily inside the function so that importing the
    `gatenlp` package itself does not pull in the serialization modules.
    """
    from gatenlp.serialization.default import HtmlAnnViewerSerializer
    from gatenlp.gatenlpconfig import gatenlpconfig

    # NOTE: "init_javscript" (sic) is the name the serializer class exposes.
    HtmlAnnViewerSerializer.init_javscript()
    gatenlpconfig.notebook_js_initialized = True


# Names exported by `from gatenlp import *`.
__all__ = ["GateNlpPr", "Annotation", "Document", "AnnotationSet",
           "ChangeLog", "logger"]

# Module-level slot for the processing resource defined last: the GateNlpPr
# decorator overwrites this attribute with the wrapper object it creates.
# It starts out as None until something has been decorated.
gate_python_plugin_pr = None

Sub-modules

gatenlp.annotation

Module for Annotation class which represents information about a span of text in a document.

gatenlp.annotation_set

Module for AnnotationSet class which represents a named collection of annotations which can arbitrarily overlap.

gatenlp.changelog

Module for ChangeLog class which represents a log of changes to any of the components of a Document: document features, annotations, annotation features.

gatenlp.corpora

Module that defines base and implementation classes for representing document collections …

gatenlp.document

Module that implements the Document class for representing gatenlp documents with features and annotation sets.

gatenlp.features

Module that implements class Feature for representing features.

gatenlp.gate_interaction

Support for interacting between a GATE (java) process and a gatenlp (Python) process. This is used by the Java GATE Python plugin.

gatenlp.gatenlpconfig

Module that provides the class GatenlpConfig and the instance gatenlpconfig which stores various global configuration options.

gatenlp.gateslave

Module for interacting with a Java GATE process, running API commands on it and exchanging data with it.

gatenlp.impl

This subpackage contains modules for (temporary) implementation of datastructures and algorithms needed. Some of these may get replaced by other …

gatenlp.lang
gatenlp.lib_spacy

Support for using spacy: convert from spacy to gatenlp documents and annotations.

gatenlp.lib_stanfordnlp

Support for using stanfordnlp: convert from stanfordnlp output to gatenlp documents and annotations.

gatenlp.lib_stanza

Support for using stanford stanza (see https://stanfordnlp.github.io/stanza/): convert from stanford Stanza output to gatenlp documents and annotations.

gatenlp.offsetmapper

Module that implements the OffsetMapper class for mapping between Java-style and Python-style string offsets. Java strings are represented as UTF16 …

gatenlp.processing
gatenlp.serialization
gatenlp.utils

Various utilities that could be useful in several modules.

Functions

def GateNlpPr(what)

This is the decorator to identify a class or function as a processing resource. It is made available with the name GateNlpPr in the gatenlp package.

This creates an instance of PRWrapper and registers all the relevant functions of the decorated class or the decorated function in the wrapper.

Args

what
the class or function to decorate.

Returns

modified class or function

Expand source code
def _pr_decorator(what):
    """
    Decorator that marks a class, a callable instance or a function as a
    processing resource. The gatenlp package exports it under the name
    GateNlpPr.

    A _PrWrapper instance is created and the relevant callables of the
    decorated object are registered on it: the execute callable (the
    `__call__` method or the function itself) and, for classes/instances,
    the optional `start`, `finish` and `reduce` methods. For each registered
    callable the wrapper also records whether keyword arguments are allowed.

    Args:
      what: the class, callable instance or function to decorate.

    Returns:
      the _PrWrapper instance, which is also stored in
      `gatenlp.gate_python_plugin_pr`.

    Raises:
      Exception: if `what` is a class whose instances have no `__call__`
        method, or if it is neither a class, a callable nor a function.
    """
    # Placeholder value; replaced by the wrapper once registration succeeded.
    gatenlp.gate_python_plugin_pr = "The PR from here!!!"

    wrapper = _PrWrapper()
    if inspect.isclass(what) or _has_method(what, "__call__"):
        if inspect.isclass(what):
            # A class gets instantiated; anything else is taken to be an instance already.
            what = what()
        # TODO: instead of this we could just as well store the instance and
        # directly call the instance methods from the wrapper!
        execmethod = _has_method(what, "__call__")
        if not execmethod:
            raise Exception("PR does not have a __call__(doc) method.")
        wrapper.func_execute_allowkws = _check_exec(execmethod)
        wrapper.func_execute = execmethod
        # The optional lifecycle hooks are all registered the same way.
        for hook in ("start", "finish", "reduce"):
            hookmethod = _has_method(what, hook)
            if not hookmethod:
                continue
            setattr(wrapper, f"func_{hook}", hookmethod)
            if inspect.getfullargspec(hookmethod).varkw:
                setattr(wrapper, f"func_{hook}_allowkws", True)
    elif inspect.isfunction(what):
        allowkws = _check_exec(what)
        wrapper.func_execute = what
        wrapper.func_execute_allowkws = allowkws
    else:
        raise Exception(f"Decorator applied to something that is not a function or class: {what}")
    gatenlp.gate_python_plugin_pr = wrapper
    return wrapper

Classes

class Annotation (start: int, end: int, anntype: str, features=None, annid: int = 0)

An annotation represents information about a span of text. It contains the start and end offsets of the span, an "annotation type" and an arbitrary number of features.

In addition it contains an id which has no meaning for the annotation itself but is used to uniquely identify an annotation within the set it is contained in.

All fields except the features are immutable, once the annotation has been created only the features can be changed.

This constructor creates a new annotation instance. Once an annotation has been created, the start, end, type and id fields cannot be changed.

NOTE: this should almost never be done directly and instead the method AnnotationSet.add should be used.

Args

start
start offset of the annotation
end
end offset of the annotation
anntype
annotation type
features
an initial collection of features, None for no features.
annid
the id of the annotation
Expand source code
class Annotation:
    """
    An annotation represents information about a span of text. It contains the start and end
    offsets of the span, an "annotation type" and an arbitrary number of features.

    In addition it contains an id which has no meaning for the annotation itself but is
    used to uniquely identify an annotation within the set it is contained in.

    All fields except the features are immutable, once the annotation has been created
    only the features can be changed.
    """

    def __init__(
            self, start: int, end: int, anntype: str,
            features=None,
            annid: int = 0
    ):
        """
        This constructor creates a new annotation instance. Once an annotation has been created,
        the start, end, type and id fields cannot be changed.

        NOTE: this should almost never be done directly
        and instead the method AnnotationSet.add should be used.

        Args:
            start: start offset of the annotation
            end: end offset of the annotation
            anntype: annotation type
            features: an initial collection of features, None for no features.
            annid: the id of the annotation

        Raises:
            Exception: if `end` is smaller than `start`, if `annid` is not an int, or
                if `features` is an int (which usually means that `features` and `annid`
                got mixed up by the caller).
        """
        if end < start:
            raise Exception(f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: start > end")
        if not isinstance(annid, int):
            raise Exception(f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: annid is not an int")
        if isinstance(features, int):
            raise Exception(f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: features must not be an int")
        # The owning set gets assigned later by whoever adds the annotation to a set.
        self._owner_set = None
        # Feature changes are reported back to this annotation so they can be logged.
        self._features = Features(features, logger=self._log_feature_change)
        self._type = anntype
        self._start = start
        self._end = end
        self._id = annid

    @property
    def type(self) -> str:
        """
        Returns the annotation type.
        """
        return self._type

    @property
    def start(self) -> int:
        """
        Returns the start offset.
        """
        return self._start

    @property
    def end(self) -> int:
        """
        Returns the end offset.
        """
        return self._end

    @property
    def features(self):
        """
        Returns the features for the annotation.
        """
        return self._features

    @property
    def id(self) -> int:
        """
        Returns the annotation id.
        """
        return self._id

    @property
    def span(self) -> Tuple[int, int]:
        """
        Returns a tuple with the start and end offset of the annotation.
        """
        return self.start, self.end

    def _changelog(self):
        """
        Returns the changelog of the owning set, or None if the annotation has no owning set.
        """
        if self._owner_set is not None:
            return self._owner_set.changelog
        return None

    # TODO: for now at least, make sure only simple JSON serialisable things are used! We do NOT
    # allow any user specific types in order to make sure what we create is interchangeable with GATE.
    # In addition we do NOT allow None features.
    # So a feature name always has to be a string (not None), the value has to be anything that is json
    # serialisable (except None keys for maps).
    # For performance reasons we check the feature name but not the value (maybe make checking optional
    # on by default but still optional?)
    def _log_feature_change(self, command: str, feature: str = None, value=None) -> None:
        """
        Append a record about a feature change to the changelog of the owning set.
        Does nothing if there is no owning set or the owning set has no changelog.

        Args:
          command: the name of the change; it is logged with the prefix "ann-"
          feature: the feature name, only logged if not None (Default value = None)
          value: the feature value, only logged if not None (Default value = None)
        """
        changelog = self._changelog()
        if changelog is None:
            return
        ch = {
            "command": "ann-" + command,
            "type": "annotation",
            "set": self._owner_set.name,
            "id": self.id}
        if feature is not None:
            ch["feature"] = feature
        if value is not None:
            ch["value"] = value
        changelog.append(ch)

    def __eq__(self, other) -> bool:
        """
        Two annotations are identical if they are the same object or if all the fields
        are equal.
        """
        if not isinstance(other, Annotation):
            return False
        if self is other:
            return True
        return self.start == other.start and self.end == other.end and \
               self.type == other.type and self.id == other.id and self._features == other._features

    def __hash__(self):
        """
        The hash depends on the annotation ID and the owning set.
        """
        return hash((self.id, self._owner_set))

    def __lt__(self, other) -> bool:
        """
        Comparison for sorting: this sorts by increasing start offset,  then increasing annotation id.
        Since annotation ids within a set are unique, this guarantees a unique order of annotations that
        come from an annotation set.

        Note: for now the other object has to be an instance of Annotation, duck typing is not supported!
        """
        if not isinstance(other, Annotation):
            raise Exception("Cannot compare to non-Annotation")
        if self.start < other.start:
            return True
        elif self.start > other.start:
            return False
        else:
            return self.id < other.id

    def __repr__(self) -> str:
        """
        String representation of the annotation.
        """
        return "Annotation({},{},{},features={},id={})".format(self.start, self.end, self.type, self._features, self.id)

    @property
    def length(self) -> int:
        """
        Returns the length of the annotation: this is the length of the offset span.
        Since the end offset is one after the last
        element, we return end-start. Note: this is deliberately not implemented as len(ann), as
        len(annset) returns the number of annotations in the set but annset.length() also returns the
        span length of the annotation set, so the method name for this is identical between annotations
        and annotation sets.
        """
        return self.end - self.start

    @support_annotation_or_set
    def isoverlapping(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is overlapping with the given span, annotation or
        annotation set.
        The annotation overlaps the span if the first or last offset of the span lies
        inside the annotation, or if the start of the annotation lies inside the span
        (which covers the case where the span completely contains the annotation).

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span

        Returns:
          `True` if overlapping, `False` otherwise

        """
        # The first two checks only find span boundaries that fall inside this
        # annotation; the third one catches a span that starts before and ends
        # after this annotation.
        return (
            self.iscovering(start)
            or self.iscovering(end - 1)
            or start <= self.start < end
        )

    @support_annotation_or_set
    def iscoextensive(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is coextensive with the given span, annotation or
        annotation set, i.e. has exactly the same start and end offsets.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span

        Returns:
          `True` if coextensive, `False` otherwise

        """
        return self.start == start and self.end == end

    @support_annotation_or_set
    def iswithin(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is within the given span, annotation or
        annotation set, i.e. the start offset of this annotation is not before the
        given start and the end offset of this annotation is not after the given end.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span

        Returns:
          `True` if within, `False` otherwise

        """
        return start <= self.start and end >= self.end

    @support_annotation_or_set
    def isbefore(self, start: int, end: int, immediately=False) -> bool:
        """
        Checks if this annotation is before the other span, i.e. the end of this annotation
        is before the start of the other annotation or span.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span
          immediately: if true checks if this annotation ends immediately before the other one (Default value = False)

        Returns:
          True if before, False otherwise

        """
        if immediately:
            return self.end == start
        else:
            return self.end <= start

    @support_annotation_or_set
    def isafter(self, start: int, end: int, immediately=False) -> bool:
        """Checks if this annotation is after the other span, i.e. the start of this annotation
        is after the end of the other annotation or span.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span
          immediately: if true checks if this annotation starts immediately after the other one (Default value = False)

        Returns:
          True if after, False otherwise

        """
        if immediately:
            return self.start == end
        else:
            return self.start >= end

    @support_annotation_or_set
    def gap(self, start: int, end: int):
        """Return the gap between this annotation and the other annotation or span. Whichever
        of the two starts first is taken as the first one, and the gap is the start offset
        of the second one minus the end offset of the first one. If both start at the same
        offset, the given span is taken as the first one.

        This is negative if the annotations overlap.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of span
          end: end offset of span

        Returns:
          size of gap

        """
        if self.start < start:
            # This annotation comes first.
            return start - self.end
        # The given span comes first (or both start at the same offset).
        return self.start - end

    @support_annotation_or_set
    def iscovering(self, start: int, end: int = None) -> bool:
        """Checks if this annotation is covering the given span, annotation or
        annotation set, i.e. the given start offset is not before the start of this
        annotation and the given end offset is not after the end of this annotation.

        If end is not given, then the method checks if start is an offset of a character
        contained in the span.

        Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
          (see gatenlp._utils.support_annotation_or_set)

        Args:
          start: start offset of the span
          end: end offset of the span

        Returns:
          True if covering, False otherwise

        """
        if end is None:
            return self.start <= start < self.end
        else:
            return self.start <= start and self.end >= end

    def to_dict(self, offset_mapper=None, offset_type=None):
        """
        Return a representation of this annotation as a nested map. This representation is
        used for several serialization methods.

        Args:
            offset_mapper: the offset mapper to use, must be specified if `offset_type` is specified.
            offset_type: the offset type to be used for the conversion, must be specified if
               `offset_mapper` is specified

        Returns:
            the dictionary representation of the Annotation

        Raises:
            Exception: if only one of `offset_mapper` and `offset_type` is given, or if
               `offset_type` is not one of the known offset types.
        """
        if (offset_mapper and not offset_type) or (not offset_mapper and offset_type):
            raise Exception("offset_mapper and offset_type must be specified both or none")
        if offset_mapper is not None:
            if offset_type == OFFSET_TYPE_JAVA:
                start = offset_mapper.convert_to_java(self._start)
                end = offset_mapper.convert_to_java(self._end)
            elif offset_type == OFFSET_TYPE_PYTHON:
                start = offset_mapper.convert_to_python(self._start)
                end = offset_mapper.convert_to_python(self._end)
            else:
                raise Exception(f"Not a valid offset type: {offset_type}, must be 'p' or 'j'")
        else:
            start = self._start
            end = self._end
        return {
            "type": self.type,
            "start": start,
            "end": end,
            "id": self.id,
            "features": self._features.to_dict(),
        }

    @staticmethod
    def from_dict(dictrepr, owner_set=None, **kwargs):
        """
        Construct an annotation object from the dictionary representation.

        Args:
          dictrepr: dictionary representation
          owner_set: the owning set the annotation should have (Default value = None)
          kwargs: ignored
        """
        ann = Annotation(
            start=dictrepr.get("start"),
            end=dictrepr.get("end"),
            anntype=dictrepr.get("type"),
            annid=dictrepr.get("id"),
            features=dictrepr.get("features")
        )
        ann._owner_set = owner_set
        return ann

    def __copy__(self):
        # The new annotation is built from this annotation's features object and has no owning set.
        return Annotation(self._start, self._end, self._type, annid=self._id, features=self._features)

    def copy(self):
        """
        Return a shallow copy of the annotation (features are shared).
        """
        return self.__copy__()

    def __deepcopy__(self, memo=None):
        # Deep-copy the plain dict form of the features so that no value is shared with the copy.
        if self._features is not None:
            fts = lib_copy.deepcopy(self._features.to_dict(), memo=memo)
        else:
            fts = None
        return Annotation(self._start, self._end, self._type, annid=self._id, features=fts)

    def deepcopy(self, memo=None):
        """
        Return a deep copy of the annotation (features and their values are copied as well).
        """
        return lib_copy.deepcopy(self, memo=memo)

Static methods

def from_dict(dictrepr, owner_set=None, **kwargs)

Construct an annotation object from the dictionary representation.

Args

dictrepr
dictionary representation
owner_set
the owning set the annotation should have (Default value = None)
kwargs
ignored
Expand source code
@staticmethod
def from_dict(dictrepr, owner_set=None, **kwargs):
    """
    Build an Annotation from its dictionary representation.

    The keys "start", "end", "type", "id" and "features" are read from the
    dictionary with `get`, so a missing key is passed on as None.

    Args:
      dictrepr: dictionary representation
      owner_set: the owning set the annotation should have (Default value = None)
      kwargs: ignored

    Returns:
      the new annotation, with its owning set set to `owner_set`
    """
    init_args = {
        "start": dictrepr.get("start"),
        "end": dictrepr.get("end"),
        "anntype": dictrepr.get("type"),
        "annid": dictrepr.get("id"),
        "features": dictrepr.get("features"),
    }
    created = Annotation(**init_args)
    created._owner_set = owner_set
    return created

Instance variables

property/get end

Returns the end offset.

Expand source code
@property
def end(self):
    """
    The end offset of the annotation (read-only).
    """
    end_offset = self._end
    return end_offset
property/get features

Returns the features for the annotation.

Expand source code
@property
def features(self):
    """
    The features object of the annotation (read-only attribute; the object
    itself is returned, not a copy).
    """
    feature_store = self._features
    return feature_store
property/get id

Returns the annotation id.

Expand source code
@property
def id(self):
    """
    The id of the annotation (read-only).
    """
    annotation_id = self._id
    return annotation_id
property/get length : int

Returns the length of the annotation: this is the length of the offset span. Since the end offset is one after the last element, we return end-start. Note: this is deliberately not implemented as len(ann), as len(annset) returns the number of annotations in the set but annset.length() also returns the span length of the annotation set, so the method name for this is identical between annotations and annotation sets.

Expand source code
@property
def length(self) -> int:
    """
    The length of the offset span of the annotation, i.e. end minus start
    (the end offset is one after the last element).

    This is deliberately a `length` attribute and not `len(ann)`:
    `len(annset)` is the number of annotations in a set, so the span length
    uses the same name `length` for both annotations and annotation sets.
    """
    last = self.end
    first = self.start
    return last - first
property/get span : Tuple[int, int]

Returns a tuple with the start and end offset of the annotation.

Expand source code
@property
def span(self) -> Tuple[int, int]:
    """
    The pair (start offset, end offset) of the annotation.
    """
    return (self.start, self.end)
property/get start : int

Returns the start offset.

Expand source code
@property
def start(self) -> int:
    """
    The start offset of the annotation (read-only).
    """
    start_offset = self._start
    return start_offset
property/get type : str

Returns the annotation type.

Expand source code
@property
def type(self) -> str:
    """
    The annotation type (read-only).
    """
    annotation_type = self._type
    return annotation_type

Methods

def copy(self)

Return a shallow copy of the annotation (features are shared).

Expand source code
def copy(self):
    """
    Return a shallow copy of the annotation (features are shared).

    This simply delegates to `__copy__`.
    """
    duplicate = self.__copy__()
    return duplicate
def deepcopy(self, memo=None)

Return a deep copy of the annotation (features and their values are copied as well).

Expand source code
def deepcopy(self, memo=None):
    """
    Return a deep copy of the annotation (features and their values are copied as well).

    Args:
      memo: the memo dictionary that is handed on to the deep copy machinery (Default value = None)
    """
    duplicate = lib_copy.deepcopy(self, memo=memo)
    return duplicate
def gap(self, start: int, end: int)

Return the gap between this annotation and the other annotation. This is the distance between the last character of the first annotation and the first character of the second annotation in sequence, so it is always independent of the order of the two annotations.

This is negative if the annotations overlap.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of span
end
end offset of span

Returns

size of gap

Expand source code
@support_annotation_or_set
def gap(self, start: int, end: int):
    """Return the gap between this annotation and the other annotation or span. Whichever
    of the two starts first is taken as the first one, and the gap is the start offset
    of the second one minus the end offset of the first one. If both start at the same
    offset, the given span is taken as the first one.

    This is negative if the annotations overlap.

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of span
      end: end offset of span

    Returns:
      size of gap

    """
    if self.start < start:
        # This annotation comes first.
        return start - self.end
    # The given span comes first (or both start at the same offset).
    return self.start - end
def isafter(self, start: int, end: int, immediately=False) ‑> bool

Checks if this annotation is after the other span, i.e. the start of this annotation is after the end of the other annotation or span.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span
immediately
if true checks if this annotation starts immediately after the other one (Default value = False)

Returns

True if after, False otherwise

Expand source code
@support_annotation_or_set
def isafter(self, start: int, end: int, immediately=False) -> bool:
    """Checks if this annotation is after the other span, i.e. this annotation does not
    start before the end offset of the other annotation or span.

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span
      immediately: if true checks if this annotation starts exactly at the end offset
        of the other one (Default value = False)

    Returns:
      True if after, False otherwise

    """
    return self.start == end if immediately else self.start >= end
def isbefore(self, start: int, end: int, immediately=False) ‑> bool

Checks if this annotation is before the other span, i.e. the end of this annotation is before the start of the other annotation or span.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span
immediately
if true checks if this annotation ends immediately before the other one (Default value = False)

Returns

True if before, False otherwise

Expand source code
@support_annotation_or_set
def isbefore(self, start: int, end: int, immediately=False) -> bool:
    """
    Checks if this annotation is before the other span, i.e. this annotation does not
    end after the start offset of the other annotation or span.

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span
      immediately: if true checks if this annotation ends exactly at the start offset
        of the other one (Default value = False)

    Returns:
      True if before, False otherwise

    """
    return self.end == start if immediately else self.end <= start
def iscoextensive(self, start: int, end: int) ‑> bool

Checks if this annotation is coextensive with the given span, annotation or annotation set, i.e. has exactly the same start and end offsets.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span

Returns

True if coextensive, False otherwise

Expand source code
@support_annotation_or_set
def iscoextensive(self, start: int, end: int) -> bool:
    """
    Checks if this annotation has exactly the same start and end offsets as the
    given span, annotation or annotation set.

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span

    Returns:
      `True` if coextensive, `False` otherwise

    """
    if self.start != start:
        return False
    return self.end == end
def iscovering(self, start: int, end: int = None) ‑> bool

Checks if this annotation is covering the given span, annotation or annotation set, i.e. both the given start and end offsets are after the start of this annotation and before the end of this annotation.

If end is not given, then the method checks if start is an offset of a character contained in the span.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span

Returns

True if covering, False otherwise

Expand source code
@support_annotation_or_set
def iscovering(self, start: int, end: int = None) -> bool:
    """Checks if this annotation is covering the given span, annotation or
    annotation set, i.e. the given start offset is not before the start of this
    annotation and the given end offset is not after the end of this annotation.

    If end is not given, then the method checks if start is an offset of a character
    contained in the span.

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span

    Returns:
      True if covering, False otherwise

    """
    if end is not None:
        return self.start <= start and self.end >= end
    # Single offset: it has to point at a character inside this annotation.
    return self.start <= start < self.end
def isoverlapping(self, start: int, end: int) ‑> bool

Checks if this annotation is overlapping with the given span, annotation or annotation set. An annotation is overlapping with a span if the first or last character is inside that span.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span

Returns

True if overlapping, False otherwise

Expand source code
@support_annotation_or_set
def isoverlapping(self, start: int, end: int) -> bool:
    """
    Checks if this annotation is overlapping with the given span, annotation or
    annotation set.
    The annotation overlaps with the span if the first or last character of the span
    is inside this annotation, or if the first character of this annotation is inside
    the span (which covers the case where the span completely contains the annotation).

    Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span

    Returns:
      `True` if overlapping, `False` otherwise

    """
    # first or last character of the span lies inside this annotation
    if self.iscovering(start) or self.iscovering(end - 1):
        return True
    # Neither boundary character of the span is inside this annotation: the only
    # remaining way to overlap is that the span contains the start of this
    # annotation (e.g. annotation [5,10) and span [0,20)).
    return start <= self.start < end
def iswithin(self, start: int, end: int) ‑> bool

Checks if this annotation is within the given span, annotation or annotation set, i.e. both the start and end offsets of this annotation are after the given start and before the given end.

Note: this can be called with an Annotation or AnnotationSet instead of start and end (see gatenlp._utils.support_annotation_or_set)

Args

start
start offset of the span
end
end offset of the span

Returns

True if within, False otherwise

Expand source code
@support_annotation_or_set
def iswithin(self, start: int, end: int) -> bool:
    """
    Tests whether this annotation lies within the given span, annotation or
    annotation set, i.e. this annotation does not start before the given start
    and does not end after the given end.

    Note: an Annotation or AnnotationSet may be passed in place of `start` and `end`
      (see gatenlp._utils.support_annotation_or_set)

    Args:
      start: start offset of the span
      end: end offset of the span

    Returns:
      `True` if within, `False` otherwise

    """
    # The annotation is outside as soon as it sticks out on either side.
    sticks_out = self.start < start or self.end > end
    return not sticks_out
def to_dict(self, offset_mapper=None, offset_type=None)

Return a representation of this annotation as a nested map. This representation is used for several serialization methods.

Args

offset_mapper
the offset mapper to use, must be specified if offset_type is specified.
offset_type
the offset type to be used for the conversion; must be specified if offset_mapper is specified

Returns

the dictionary representation of the Annotation

Expand source code
def to_dict(self, offset_mapper=None, offset_type=None):
    """
    Build the nested-map representation of this annotation that is used by several
    serialization methods.

    Args:
        offset_mapper: the offset mapper used to convert the offsets; has to be given
           whenever `offset_type` is given.
        offset_type: the offset type to convert to (OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON);
           has to be given whenever `offset_mapper` is given.

    Returns:
        a dict with the keys "type", "start", "end", "id" and "features"

    Raises:
        Exception: if only one of `offset_mapper` / `offset_type` is specified or if
           `offset_type` is not one of the known offset types.
    """
    # Either both or none of the two conversion parameters may be specified.
    if bool(offset_mapper) != bool(offset_type):
        raise Exception("offset_mapper and offset_type must be specified both or none")
    if offset_mapper is None:
        # no conversion requested: use the offsets as stored
        start, end = self._start, self._end
    elif offset_type == OFFSET_TYPE_JAVA:
        start = offset_mapper.convert_to_java(self._start)
        end = offset_mapper.convert_to_java(self._end)
    elif offset_type == OFFSET_TYPE_PYTHON:
        start = offset_mapper.convert_to_python(self._start)
        end = offset_mapper.convert_to_python(self._end)
    else:
        raise Exception(f"Not a valid offset type: {offset_type}, must be 'p' or 'j'")
    representation = {
        "type": self.type,
        "start": start,
        "end": end,
        "id": self.id,
        "features": self._features.to_dict(),
    }
    return representation
class AnnotationSet (name: str = '', owner_doc: Document = None)

Creates an annotation set. This should not be used directly by the user; instead, the method Document.annset(name) should be used to access the annotation set with a given name from the document.

An annotation set contains an arbitrary number of annotations, which can overlap in arbitrary ways. Each annotation set has a name and a document can have as many named annotation sets as needed.

Args

name
the name of the annotation set, default: the empty string (default annotation set)
owner_doc
if this is set, the set and all sets created from it can be queried for the owning document and offsets get checked against the text of the owning document, if it has text. Also, the changelog is only updated if an annotation set has an owning document.
Expand source code
class AnnotationSet:
    def __init__(self, name: str = "", owner_doc: "Document" = None):
        """
        Initializes an empty annotation set. User code should not call this directly but
        obtain the set for a given name from the document via `Document.annset(name)`.

        A set holds any number of annotations which may overlap arbitrarily. Every set has a
        name, and a document can have as many named sets as needed.

        Args:
          name: the name of the annotation set, default: the empty string (default annotation set)
          owner_doc: if this is set, the set and all sets created from it can be queried for the
              owning document and offsets get checked against the text of the owning document, if it has
              text. Also, the changelog is only updated if an annotation set has an owning document.
        """
        self._name = name
        self._owner_doc = owner_doc
        # primary storage: maps annotation id (int) to the Annotation instance
        self._annotations = {}
        # id that gets assigned to the next annotation added without an explicit id
        self._next_annid = 0
        self._is_immutable = False
        # both indices are built lazily, on first use
        self._index_by_type = None
        self._index_by_offset = None

    @property
    def name(self) -> str:
        """
        The name of this annotation set, as stored by the constructor.

        Note: this property defines no setter, so the name cannot be changed through it.
        """
        return self._name

    @property
    def changelog(self):
        """
        The changelog of the owning document, or None if this set has no owning document.
        """
        owner = self._owner_doc
        return None if owner is None else owner.changelog

    def __setattr__(self, key, value):
        """
        Reject re-assignment of the attributes "name" and "owner_doc" once they hold a
        value other than None; every other attribute is assigned normally.

        NOTE(review): the constructor stores its values under `_name` and `_owner_doc`,
        which are not among the keys guarded here - confirm whether that is intended.

        Raises:
          Exception: if a guarded attribute which is already set is assigned again.
        """
        guarded = key in ("name", "owner_doc")
        if guarded and self.__dict__.get(key, None) is not None:
            raise Exception("AnnotationSet attribute cannot get changed after being set")
        super().__setattr__(key, value)

    def detach(self, restrict_to=None) -> "AnnotationSet":
        """
        Returns an immutable, detached copy of this set, optionally limited to the given
        annotation ids. The copy has no owning document, so adding or removing annotations
        there does not affect the document. The Annotation objects themselves are shared
        with this set though, so changing their features is visible in both sets.

        Args:
          restrict_to: an iterable of annotation ids, if None, all the annotations from this set.

        Returns:
          an immutable annotation set
        """
        # iterating the dict itself yields all annotation ids of this set
        wanted_ids = self._annotations if restrict_to is None else restrict_to
        detached = AnnotationSet(name="detached-from:"+self.name)
        detached._is_immutable = True
        detached._annotations = {annid: self._annotations[annid] for annid in wanted_ids}
        # continue the id numbering of this set in the copy
        detached._next_annid = self._next_annid
        return detached

    def detach_from(self, anns: Iterable) -> "AnnotationSet":
        """
        Creates an immutable detached annotation set from the annotations in anns which could be
        either a collection of annotations or annotation ids (int numbers) which are assumed to
        be the annotation ids from this set.

        The next annotation id for the created set is the highest seen annotation id from anns plus one
        (0 if anns is empty).

        Args:
          anns: an iterable of annotations and/or annotation ids of this set

        Returns:
          an immutable detached annotation set

        Raises:
          KeyError: if an annotation id is given which is not in this set
        """
        annset = AnnotationSet(name="detached-from:"+self.name)
        annset._is_immutable = True
        annset._annotations = {}
        highest_id = -1
        for ann in anns:
            if isinstance(ann, int):
                # an annotation id: look up the annotation in this set
                annid = ann
                annset._annotations[annid] = self._annotations[annid]
            else:
                # an annotation instance: store it under its own annotation id
                # (not under the builtin function `id`)
                annid = ann.id
                annset._annotations[annid] = ann
            if annid > highest_id:
                highest_id = annid
        annset._next_annid = highest_id + 1
        return annset

    @property
    def immutable(self) -> bool:
        """
        Get or set the immutability of the annotation set. While the set is immutable, `add` and
        `remove` raise an exception, but the annotations themselves can still have their
        features modified.

        Detached annotation sets created by `detach` and `detach_from` start out immutable,
        but can be made mutable afterwards by assigning False to this property.
        """
        return self._is_immutable

    @immutable.setter
    def immutable(self, val: bool) -> None:
        """
        Set the immutability flag of the annotation set.

        Args:
          val: True to make the set immutable, False to make it mutable
        """
        self._is_immutable = val

    def isdetached(self) -> bool:
        """
        Returns True if the annotation set is detached, i.e. has no owning document,
        False otherwise.
        """
        return self._owner_doc is None

    def _create_index_by_offset(self) -> None:
        """
        Builds the offset index unless it already exists.
        The index is a SortedIntvls instance which receives one (start, end, annotation id)
        entry per annotation in the set.
        """
        if self._index_by_offset is not None:
            return
        offset_index = SortedIntvls()
        self._index_by_offset = offset_index
        for ann in self._annotations.values():
            offset_index.add(ann.start, ann.end, ann.id)

    def _create_index_by_type(self) -> None:
        """Builds the type index unless it already exists. The type index maps each
        annotation type to the set of ids of all annotations of that type.
        """
        if self._index_by_type is not None:
            return
        type_index = defaultdict(set)
        self._index_by_type = type_index
        for ann in self._annotations.values():
            type_index[ann.type].add(ann.id)

    def _add_to_indices(self, annotation: Annotation) -> None:
        """
        Registers the annotation with each index that has already been created;
        indices which do not exist yet are left alone.

        Args:
          annotation: the annotation to add to the indices.
        """
        type_index = self._index_by_type
        if type_index is not None:
            type_index[annotation.type].add(annotation.id)
        offset_index = self._index_by_offset
        if offset_index is not None:
            offset_index.add(annotation.start, annotation.end, annotation.id)

    def _remove_from_indices(self, annotation: Annotation) -> None:
        """Unregisters the annotation from each index that has already been created.

        Args:
          annotation: the annotation to remove.
        """
        offset_index = self._index_by_offset
        if offset_index is not None:
            offset_index.remove(annotation.start, annotation.end, annotation.id)
        type_index = self._index_by_type
        if type_index is not None:
            type_index[annotation.type].remove(annotation.id)

    @staticmethod
    def _intvs2idlist(intvs, ignore=None) -> List[int]:
        """Extracts the ids from an iterable of interval tuples (start, end, id) as a list.

        Args:
          intvs: iterable of interval tuples
          ignore: an optional annotation id that should not get included in the result (Default value = None)

        Returns:
          list of ids, in the order of the given intervals
        """
        ids = [intv[2] for intv in intvs]
        if ignore is None:
            return ids
        return [annid for annid in ids if annid != ignore]

    @staticmethod
    def _intvs2idset(intvs, ignore=None) -> Set[int]:
        """Extracts the ids from an iterable of interval tuples (start, end, id) as a set.

        Args:
          intvs: iterable of interval tuples
          ignore: an optional annotation id that should not get included in the result (Default value = None)

        Returns:
          set of ids
        """
        if ignore is None:
            return {intv[2] for intv in intvs}
        return {intv[2] for intv in intvs if intv[2] != ignore}

    def _restrict_intvs(self, intvs, ignore=None) -> "AnnotationSet":
        """
        Creates a detached copy of this set which is restricted to the annotations whose
        ids occur in the given interval tuples.

        Args:
          intvs: iterable of interval tuples (start, end, id)
          ignore: an optional annotation id to leave out of the result (Default value = None)

        Returns:
          the annotation set created by `detach` for the selected annotation ids
        """
        return self.detach(restrict_to=AnnotationSet._intvs2idlist(intvs, ignore=ignore))

    def __len__(self) -> int:
        """
        Return number of annotations in the set.

        Returns:
          number of annotations
        """
        return len(self._annotations)

    @property
    def size(self) -> int:
        """
        Returns the number of annotations in the annotation set (same value as `len()` of the set).
        """
        return len(self._annotations)

    @property
    def document(self) -> Union["Document", None]:
        """
        Returns the owning document, or None if this set has no owning document.
        """
        return self._owner_doc

    @support_annotation_or_set
    def _check_offsets(self, start: int, end: int, annid=None) -> None:
        """
        Validates the offsets of a span/annotation against the boundaries of the owning document.
        Nothing is checked if the set has no owning document or if the text of the owning
        document is None.

        Args:
          start: start offset of the span
          end: end offset of the span
          annid: not used by this method (Default value = None)

        Raises:
          InvalidOffsetError: if an offset is negative, the end is before the start, or
            an offset is bigger than the document length.
        """
        doc = self._owner_doc
        if doc is None or doc.text is None:
            return
        doc_size = len(doc)

        # the order of the checks determines which message gets reported
        if start < 0:
            raise InvalidOffsetError("Annotation starts before 0")
        if end < 0:
            raise InvalidOffsetError("Annotation ends before 0")
        if start > end:
            raise InvalidOffsetError("Annotation ends before it starts")
        if start > doc_size:
            msg = "Annotation starts after document ends: start={}, docsize={}".format(start, doc_size)
            raise InvalidOffsetError(msg)
        if end > doc_size:
            msg = "Annotation ends after document ends: end={}, docsize={}".format(end, doc_size)
            raise InvalidOffsetError(msg)

    @property
    def start(self):
        """
        The start offset of the span of the whole set, i.e. the smallest start offset of
        any annotation in the set. Creates the offset index if it does not exist yet.

        Throws:
          an exception if there are no annotations in the set.
        """
        if not self.size:
            raise Exception("Annotation set is empty, cannot determine start offset")
        self._create_index_by_offset()
        offset_index = self._index_by_offset
        return offset_index.min_start()

    @property
    def end(self):
        """
        The end offset of the span of the whole set, i.e. the biggest end offset of
        any annotation in the set. Creates the offset index if it does not exist yet.

        Throws:
          an exception if there are no annotations in the set.
        """
        if not self.size:
            raise Exception("Annotation set is empty, cannot determine end offset")
        self._create_index_by_offset()
        offset_index = self._index_by_offset
        return offset_index.max_end()

    @property
    def length(self):
        """
        Returns the length of the annotation set span, i.e. the difference between
        the end offset and the start offset of the set.

        Throws:
          an exception if there are no annotations in the set.
        """
        # `start` and `end` are properties, so they must be read, not called
        return self.end - self.start

    def add(self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None):
        """
        Adds an annotation to the set. Once an annotation has been added, the start and end offsets,
        the type, and the annotation id of the annotation are immutable.

        Args:
          start: start offset
          end: end offset
          anntype: the annotation type
          features: a map, an iterable of tuples or an existing feature map. In any case, the features are used
            to create a new feature map for this annotation. If the map is empty or this parameter is None, the
            annotation does not store any map at all.
          annid: the annotation id, if not specified the next free one for this set is used. NOTE: the id should
            normally left unspecified and get assigned automatically.

        Returns:
            the new annotation

        Raises:
            Exception: if the parameter types look mixed up, if the set is immutable, or if an
              annotation with the given id (including id 0) is already in the set.
            InvalidOffsetError: if the offsets are not valid for the owning document.
        """
        if annid is not None and not isinstance(annid, int):
            raise Exception("Parameter annid must be an int, mixed up with features?")
        if features is not None and isinstance(features, int):
            raise Exception("Parameter features must not be an int: mixed up with annid?")
        if self._is_immutable:
            raise Exception("Cannot add an annotation to an immutable annotation set")
        self._check_offsets(start, end)
        # Compare against None rather than relying on truthiness: 0 is a valid annotation id
        # and must not bypass the duplicate check.
        if annid is not None and annid in self._annotations:
            raise Exception("Cannot add annotation with id {}, already in set".format(annid))
        if annid is None:
            annid = self._next_annid
            self._next_annid = self._next_annid + 1
        # NOTE(review): an explicitly given annid does not advance _next_annid, so a later
        # automatically assigned id could coincide with it - confirm callers keep the counter in sync.
        ann = Annotation(start, end, anntype, features=features, annid=annid)
        ann._owner_set = self
        if not self._annotations:
            self._annotations = {}
        self._annotations[annid] = ann
        self._add_to_indices(ann)
        # the changelog only exists if there is an owning document which has one
        if self.changelog is not None:
            entry = {
                    "command": "annotation:add",
                    "set": self.name,
                    "start": ann.start,
                    "end": ann.end,
                    "type": ann.type,
                    "features": ann._features.to_dict(),
                    "id": ann.id
                }
            self.changelog.append(entry)
        return ann

    def add_ann(self, ann, annid: int = None):
        """
        Adds a new annotation to the set which has the same offsets, type and features as the
        given annotation, either with a new annotation id or with the one given. The new
        annotation is created by passing these values to `add`.

        Args:
          ann: the annotation to copy into the set
          annid: the annotation id, if not specified the next free one for this set is used. Note:
             the id should normally left unspecified and get assigned automatically.

        Returns:
          the added annotation
        """
        return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)

    def remove(self, annotation: Union[int, Annotation]) -> None:
        """
        Removes the given annotation which is either the id or the annotation instance.

        Throws:
            exception if the annotation set is immutable, the annotation is not in the set, or
            the parameter is neither an annotation id nor an Annotation instance

        Args:
          annotation: either the id (int) or the annotation instance (Annotation)
        """
        if self._is_immutable:
            raise Exception("Cannot remove an annotation from an immutable annotation set")
        if isinstance(annotation, int):
            annid = annotation
            if annid not in self._annotations:
                raise Exception("Annotation with id {} not in annotation set, cannot remove".format(annid))
            annotation = self._annotations[annid]
        elif isinstance(annotation, Annotation):
            annid = annotation.id
            if annid not in self._annotations:
                raise Exception("Annotation with id {} does not belong to this set, cannot remove".format(annid))
        else:
            # fail with a clear message instead of an obscure KeyError/AttributeError further down
            raise Exception("Parameter annotation must be an annotation id (int) or an Annotation instance")
        # NOTE: once the annotation has been removed from the set, it could still be referenced
        # somewhere else and its features could get modified. In order to prevent logging of such changes,
        # the owning set gets cleared for the annotation
        annotation._owner_set = None
        del self._annotations[annid]
        if self.changelog is not None:
            self.changelog.append({
                "command": "annotation:remove",
                "set": self.name,
                "id": annid})
        self._remove_from_indices(annotation)

    def clear(self) -> None:
        """
        Removes all annotations from the set, discards both indices and, if there is a
        changelog, records the operation there.

        NOTE(review): unlike `add` and `remove`, this does not check whether the set is
        immutable - confirm that this is intended.
        """
        self._annotations.clear()
        # the indices get rebuilt lazily when they are needed again
        self._index_by_type = None
        self._index_by_offset = None
        log = self.changelog
        if log is None:
            return
        log.append({
            "command": "annotations:clear",
            "set": self.name})

    def clone_anns(self, memo=None):
        """
        Replaces every annotation in this set with a deep copy of itself. For a detached set this
        guarantees that changes to its annotations no longer affect the annotations of the set it
        was detached from; for an attached set it guarantees that detached sets can no longer affect
        the annotations in this set. The owning set of each original annotation is cleared.

        Args:
          memo: for internal use by our __deepcopy__ implementation.
        """
        clones = {}
        for annid, original in self._annotations.items():
            # copy first, then cut the original loose from its owning set
            clones[annid] = copy.deepcopy(original, memo=memo)
            original._owner_set = None
        # only existing keys get re-assigned, so the set of ids stays the same
        self._annotations.update(clones)

    def __copy__(self):
        """
        Shallow copy: the result is always a detached set, but unlike the result of
        `detach` it is mutable.
        """
        duplicate = self.detach()
        duplicate._is_immutable = False
        return duplicate

    def copy(self):
        """
        Returns a shallow copy of the annotation set: a detached, mutable set which
        shares the annotation objects with this set (see `__copy__`).
        """
        return self.__copy__()

    def __deepcopy__(self, memo=None):
        """
        Deep copy: creates a detached, mutable set whose annotations are deep copies.

        Args:
          memo: the memo dictionary of the copy module, a new one is used if None
        """
        memo = {} if memo is None else memo
        duplicate = self.detach()
        duplicate._is_immutable = False
        # replace the shared annotations in the new set by deep copies
        duplicate.clone_anns(memo=memo)
        return duplicate

    def deepcopy(self):
        """
        Returns a deep copy of the annotation set, created via `copy.deepcopy`
        (see `__deepcopy__`).
        """
        return copy.deepcopy(self)

    def __iter__(self) -> Iterator:
        """
        Yields all the annotations of the set by delegating to `iter()` without any restrictions.

        Important: using the iterator will always create the index if it is not already there!
        For fast iteration use fast_iter() which does not allow sorting or offset ranges.

        Yields:
            the annotations in document order
        """
        # return iter(self._annotations.values())
        return self.iter()

    def fast_iter(self) -> Generator:
        """
        Yields the annotations in the order in which they are stored, i.e. insertion order.
        This is faster than the default iterator because no index is needed (a missing index
        is not built).
        """
        if not self._annotations:
            return
        yield from self._annotations.values()

    def iter(self,
             start_ge: Union[int, None] = None,
             start_lt: Union[None, int] = None,
             with_type: str = None,
             reverse: bool = False) -> Generator:
        """
        Yields annotations in document order, optionally limited
        by the other parameters. If two annotations start at the same offset, they are always
        ordered by increasing annotation id.

        Args:
          start_ge: the offset from where to start including annotations
          start_lt: the last offset to use as the starting offset of an annotation
          with_type: only annotations of this type; either a single type name (str) or an
            iterable of type names
          reverse: process in reverse document order

        Yields:
          annotations in document order

        """
        # Normalise with_type to a set of allowed type names. The check must be made on the
        # parameter itself: a single string is one type name, not an iterable of characters.
        if with_type is None:
            allowedtypes = None
        elif isinstance(with_type, str):
            allowedtypes = {with_type}
        else:
            allowedtypes = set(with_type)
        if not self._annotations:
            return
        maxoff = None
        if start_ge is not None:
            assert start_ge >= 0
        if start_lt is not None:
            assert start_lt >= 1
            # NOTE(review): the upper bound handed to the index is start_lt + 1 - confirm this
            # matches the bound semantics of SortedIntvls.irange.
            maxoff = start_lt + 1
        if start_lt is not None and start_ge is not None:
            assert start_lt > start_ge
        self._create_index_by_offset()
        for _start, _end, annid in self._index_by_offset.irange(minoff=start_ge, maxoff=maxoff, reverse=reverse):
            ann = self._annotations[annid]
            if allowedtypes is not None and ann.type not in allowedtypes:
                continue
            yield ann

    def reverse_iter(self, **kwargs):
        """
        Same as iter, but with the reverse parameter set to true.

        Args:
          **kwargs: get passed on to `iter()`; `reverse` must not be among them because
            it is already fixed to True here.

        Returns:
          same result as iter()

        """
        return self.iter(reverse=True, **kwargs)

    def get(self, annid: Union[int, Annotation], default=None) -> Union[Annotation, None]:
        """Looks up the annotation with the given annotation id and returns the given default
        if there is none.

        NOTE: in order to support legacy code which still expects the add method to return
        an id instead of the annotation, an annotation is accepted as well, so that this
        frequent pattern keeps working:

           annid = annset.add(b,e,t).id
           ann = annset.get(annid)

        If an annotation is passed, its id is used for the lookup: the result is whatever this
        set stores under that id (the same object if the annotation is from this set and
        still in it), or the default value.

        Args:
          annid: the annotation id of the annotation to retrieve, or an annotation.
          default: what to return if an annotation with the given id is not found. (Default value = None)

        Returns:
          the annotation or the default value.

        """
        key = annid.id if isinstance(annid, Annotation) else annid
        return self._annotations.get(key, default)

    def first(self):
        """
        Returns the first annotation of the set according to the offset index. Creates the
        offset index if necessary, except when the set has exactly one annotation.

        Returns:
          first annotation

        Raises:
          Exception: if the set is empty
        """
        count = len(self._annotations)
        if count == 0:
            raise Exception("Empty set, there is no first annotation")
        if count == 1:
            # a single annotation: no need to build the index
            return next(iter(self._annotations.values()))
        self._create_index_by_offset()
        entries = self._index_by_offset.irange(reverse=False)
        _start, _end, first_id = next(entries)
        return self._annotations[first_id]

    def last(self):
        """
        Returns the last annotation of the set according to the offset index. Creates the
        offset index if necessary, except when the set has exactly one annotation.

        Returns:
          last annotation

        Raises:
          Exception: if the set is empty
        """
        count = len(self._annotations)
        if count == 0:
            raise Exception("Empty set, there is no last annotation")
        if count == 1:
            # a single annotation: no need to build the index
            return next(iter(self._annotations.values()))
        self._create_index_by_offset()
        entries = self._index_by_offset.irange(reverse=True)
        _start, _end, last_id = next(entries)
        return self._annotations[last_id]

    def __getitem__(self, annid):
        """
        Gets the annotation with the given annotation id or throws an exception.

        Args:
            annid: the annotation id

        Returns:
            annotation

        Raises:
            KeyError: if there is no annotation with that id in the set
        """
        return self._annotations[annid]

    def with_type(self, *anntype: Union[str, Iterable],
                  non_overlapping: bool = False) -> "AnnotationSet":
        """
        Gets annotations of the specified type(s).
        Creates the type index if necessary.

        Args:
          anntype: one or more types or type lists. The union of all types specified that way
            is used to filter the annotations. If no type is specified, all annotations are selected.

          non_overlapping: if True, only return annotations of any of the given types which
            do not overlap with other annotations. If there are several annotations that start at
            the same offset, use the type that comes first in the parameters, if there are more
            than one of that type, use the one that would come first in the usual sort order.

        Returns:
            a detached immutable annotation set with the matching annotations.
        """
        # flatten the positional arguments into one list of type names, keeping their order
        atypes = []
        for atype in anntype:
            if isinstance(atype, str):
                atypes.append(atype)
            else:
                for t in atype:
                    atypes.append(t)
        if not atypes:
            return self.detach()
        self._create_index_by_type()
        # union of the ids of all annotations which have one of the requested types
        annids = set()
        for t in atypes:
            idxs = self._index_by_type.get(t)
            if idxs:
                annids.update(idxs)
        if non_overlapping:
            # need to get annotations grouped by start offset and sorted according to
            # what the Annotation class defines
            # (the sort key is the Annotation itself, so this relies on Annotation ordering)
            allanns = sorted(annids, key=lambda x: self._annotations[x])
            allanns = [self._annotations[x] for x in allanns]
            allannsgrouped = []
            curstart = None
            curset = None
            for ann in allanns:
                if curstart is None:
                    curset = [ann]
                    curstart = ann.start
                elif curstart == ann.start:
                    curset.append(ann)
                else:
                    allannsgrouped.append(curset)
                    curset = [ann]
                    curstart = ann.start
            # do not forget the group which was still being built when the loop ended
            if curset:
                allannsgrouped.append(curset)
            retanns = []
            # now go through all the grouped annoations and select the top priority one
            # then skip to the next group that does not overlap with the one we just selected
            # types given earlier get a bigger priority number; if a type occurs more than
            # once, the number of its last occurrence is the one that is kept
            typepriority = dict()
            for i, atype in enumerate(atypes):
                typepriority[atype] = len(atypes)-i
            # end offset of the annotation selected last: later selections must not start before it
            curminoffset = 0
            for group in allannsgrouped:
                # instead of sorting, go through the group and find the top priority one
                topann = None
                if len(group) == 1:
                    if group[0].start >= curminoffset:
                        topann = group[0]
                elif len(group) == 0:
                    raise Exception("We should never get a 0 size group here!")
                else:
                    # find the first candidate which does not start before curminoffset
                    for i, ann in enumerate(group):
                        if ann.start >= curminoffset:
                            topann = ann
                            break
                    # compare the remaining annotations of the group against the current candidate
                    # NOTE(review): ties on the type priority are resolved in favour of the bigger
                    # end offset and then the bigger id - confirm this is what the docstring means
                    # by "first in the usual sort order"
                    for ann in group[i+1:]:
                        if ann.start < curminoffset:
                            continue
                        if typepriority[ann.type] > typepriority[topann.type]:
                            topann = ann
                        elif typepriority[ann.type] == typepriority[topann.type]:
                            if ann.end > topann.end:
                                topann = ann
                            elif ann.end == topann.end:
                                if ann.id > topann.id:
                                    topann = ann
                if topann is not None:
                    retanns.append(topann)
                    curminoffset = topann.end
            annids = [ann.id for ann in retanns]
        return self.detach(restrict_to=annids)

    def by_offset(self):
        """
        Generate the annotations of this set grouped by their start offset.

        The offset index is created first if necessary. Annotations are consumed in the
        order produced by `self.iter()` and every run of consecutive annotations sharing
        the same start offset is yielded as one list. An empty set yields nothing.

        Yields:
          lists of annotations which start at the same offset
        """
        self._create_index_by_offset()
        group = []
        group_start = -1  # sentinel: no group has been opened yet
        for ann in self.iter():
            if ann.start == group_start:
                group.append(ann)
                continue
            # a different start offset begins a new group: emit the finished one first
            if group_start != -1:
                yield group
            group_start = ann.start
            group = [ann]
        if group_start != -1:
            yield group

    def by_span(self):
        """
        Generate the annotations of this set grouped by identical span.

        The offset index is created first if necessary. Annotations are consumed in the
        order produced by `self.iter()` and every run of consecutive annotations with the
        same start and end offset is yielded as one list. An empty set yields nothing.

        Yields:
          lists of annotations with identical spans
        """
        self._create_index_by_offset()
        group = []
        group_start = -1  # sentinel: no group has been opened yet
        group_end = -1
        for ann in self.iter():
            if ann.start == group_start and ann.end == group_end:
                group.append(ann)
                continue
            # the span changed: emit the finished group before opening a new one
            if group_start != -1:
                yield group
            group_start = ann.start
            group_end = ann.end
            group = [ann]
        if group_start != -1:
            yield group

    @property
    def type_names(self) -> KeysView[str]:
        """
        The annotation type names occurring in this set.

        The type index is created first if it does not exist yet; the result is the
        key view of that index.
        """
        self._create_index_by_type()
        type_index = self._index_by_type
        return type_index.keys()

    @support_annotation_or_set
    def start_eq(self, start: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
        """
        Gets all annotations starting exactly at the given offset (empty if none) and returns them
        in a detached annotation set.

        Note: this can be called with an annotation or annotation set instead of the start offset. If called
        with an annotation, this annotation is not included in the result set if `include_self` is `False`

        Args:
          start: the offset where annotations should start
          ignored: dummy parameter to allow the use of annotations and annotation sets
          annid:  dummy parameter to allow the use of annotations and annotation sets
          include_self:  should annotation passed be included in the result

        Returns:
            detached annotation set of matching annotations
        """
        self._create_index_by_offset()
        # starting_from() is the "at or after" query that start_ge relies on, so using its
        # result unfiltered would make this method identical to start_ge. Keep only the
        # intervals whose start offset (position 0 of the interval tuple) equals `start`.
        intvs = [
            intv
            for intv in self._index_by_offset.starting_from(start)
            if intv[0] == start
        ]
        ignore = None if include_self else annid
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def start_min_ge(self, offset: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
        """Gets all annotations starting at the first possible offset
        at or after the given offset and returns them in an immutable
        annotation set.

        Args:
          offset: The offset
          ignored: dummy parameter to allow the use of annotations and annotation sets
          annid:  annotation id
          include_self: should annotation passed be included in the result

        Returns:
          annotation set of matching annotations

        """
        self._create_index_by_offset()
        # id to leave out of the result, or None if nothing has to be skipped
        skip_id = None if include_self else annid
        first_offset = None
        matching_ids = set()
        for intv in self._index_by_offset.starting_from(offset):
            if first_offset is None:
                # the first interval seen fixes the offset all results must share
                first_offset = intv[0]
            elif intv[0] != first_offset:
                # a later start offset was reached: nothing more to collect
                break
            if skip_id is None or intv[2] != skip_id:
                matching_ids.add(intv[2])
        return self.detach(restrict_to=matching_ids)

    @support_annotation_or_set
    def start_ge(self, start: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
        """Return the annotations that start at or after the given start offset.

        Args:
          start: Start offset
          ignored: dummy parameter to allow the use of annotations and annotation sets
          annid:  annotation id
          include_self:  should annotation passed be included in the result

        Returns:
          an immutable annotation set of the matching annotations

        """
        self._create_index_by_offset()
        candidates = self._index_by_offset.starting_from(start)
        # leave out the passed annotation unless the caller asked to keep it
        skip_id = None if include_self else annid
        return self._restrict_intvs(candidates, ignore=skip_id)

    @support_annotation_or_set
    def start_lt(self, offset: int, ignored: Any = None, annid=None) -> "AnnotationSet":
        """
        Return the annotations that start before the given offset. An annotation or
        annotation set can be passed instead of the offset.

        Args:
          offset: offset before which the annotations should start
          ignored: dummy parameter to allow the use of annotations and annotation sets
          annid:  annotation id (not used for filtering here)

        Returns:
          an immutable annotation set of the matching annotations

        """
        self._create_index_by_offset()
        return self._restrict_intvs(self._index_by_offset.starting_before(offset))


    @support_annotation_or_set
    def overlapping(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
        """
        Gets annotations overlapping with the given span. Instead of the start and end offsets,
        also accepts an annotation or annotation set.

        For each annotation ann in the result set, ann.overlapping(span) is True

        Args:
          start: start offset of the span
          end: end offset of the span
          annid: the annotation id of the annotation representing the span. (Default value = None)
          include_self: if False and the annotation id for the span is given, that annotation is
            left out of the result set; if True it is kept. (Default value = False)

        Returns:
          an immutable annotation set with the matching annotations

        """
        self._create_index_by_offset()
        matches = self._index_by_offset.overlapping(start, end)
        # the span's own annotation is only skipped when include_self is not set
        skip_id = None if include_self else annid
        return self._restrict_intvs(matches, ignore=skip_id)

    @support_annotation_or_set
    def covering(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
        """
        Gets the annotations which contain the given offset range (or annotation/annotation set),
        i.e. annotations such that the given offset range is within the annotation.

        For each annotation ann in the result set, ann.covering(span) is True.

        Args:
          start: the start offset of the span
          end: the end offset of the span
          annid: the annotation id of the annotation representing the span. (Default value = None)
          include_self: if False and the annotation id for the span is given, that annotation is
            left out of the result set; if True it is kept. (Default value = False)

        Returns:
          an immutable annotation set with the matching annotations, if any

        """
        self._create_index_by_offset()
        matches = self._index_by_offset.covering(start, end)
        # the span's own annotation is only skipped when include_self is not set
        skip_id = None if include_self else annid
        return self._restrict_intvs(matches, ignore=skip_id)

    @support_annotation_or_set
    def within(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
        """
        Gets annotations that fall completely within the given offset range, i.e. annotations
        such that the offset range is covering each of the annotation.

        For each annotation ann in the result set, ann.within(span) is True.

        Args:
          start: start offset of the range
          end: end offset of the range
          annid: the annotation id of the annotation representing the span. (Default value = None)
          include_self: if False and the annotation id for the span is given, that annotation is
             left out of the result set; if True it is kept. (Default value = False)

        Returns:
          an immutable annotation set with the matching annotations

        Raises:
          Exception: if start is greater than end
        """
        if start > end:
            raise Exception("Invalid offset range: {},{}".format(start, end))
        if start == end:
            # an empty range is answered without consulting (or building) the index
            matches = []
        else:
            self._create_index_by_offset()
            matches = self._index_by_offset.within(start, end)
        skip_id = None if include_self else annid
        return self._restrict_intvs(matches, ignore=skip_id)

    @support_annotation_or_set
    def coextensive(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
        """
        Returns a detached annotation set with all annotations that start and end at the given offsets.

        For each annotation ann in the result set, ann.coextensive(span) is True.

        Args:
          start: start offset of the span
          end: end offset of the span
          annid: the annotation id of the annotation representing the span. (Default value = None)
          include_self: if False and the annotation id for the span is given, that annotation is
             left out of the result set; if True it is kept. (Default value = False)

        Returns:
          annotation set with all annotations that have the same start and end offsets.
        """
        self._create_index_by_offset()
        matches = self._index_by_offset.at(start, end)
        # the span's own annotation is only skipped when include_self is not set
        skip_id = None if include_self else annid
        return self._restrict_intvs(matches, ignore=skip_id)

    @property
    def span(self) -> Tuple[int, int]:
        """
        Returns a tuple (start, end) where start is the smallest start offset of any annotation
        in the set and end is the largest end offset of any annotation in the set.
        (Builds the offset index)
        """
        self._create_index_by_offset()
        offset_index = self._index_by_offset
        first_start = offset_index.min_start()
        last_end = offset_index.max_end()
        return first_start, last_end

    def __contains__(self, annorannid: Union[int, Annotation]) -> bool:
        """
        Provides 'annotation in annotation_set' functionality

        Membership is decided by annotation id only: for an Annotation instance its `id`
        is looked up, anything else is treated as the id itself.

        Args:
            annorannid: the annotation instance or annotation id to check

        Returns:
            `True` if the annotation exists in the set, `False` otherwise
        """
        # Callers may pass the id directly instead of an annotation.
        key = annorannid.id if isinstance(annorannid, Annotation) else annorannid
        return key in self._annotations

    contains = __contains__

    def __repr__(self) -> str:
        """
        Returns the string representation of the set.
        """
        return "AnnotationSet({})".format(repr(list(self.iter())))

    def to_dict(self, **kwargs):
        """
        Convert an annotation set to its dict representation.

        The result has the keys "name", "annotations" (a list with the dict representation
        of every annotation) and "next_annid".

        Args:
          **kwargs: passed on to the dict creation of contained annotations.

        Returns:
            the dict representation of the annotation set.
        """
        # NOTE: Changelog is not getting added as it is stored in the document part!
        ann_dicts = [ann.to_dict(**kwargs) for ann in self._annotations.values()]
        return {
            "name": self.name,
            "annotations": ann_dicts,
            "next_annid": self._next_annid,
        }

    @staticmethod
    def from_dict(dictrepr, owner_doc=None, **kwargs):
        """
        Create an AnnotationSet from its dict representation and optionally set the owning document.

        The keys "name", "next_annid" and "annotations" of the dict are read; a missing or
        empty "annotations" entry results in a set without annotations.

        Args:
          dictrepr: the dict representation of the annotation set
          owner_doc:  the owning document
          **kwargs: passed on to the creation of annotations

        Returns:
            the annotation set
        """
        annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc)
        annset._next_annid = dictrepr.get("next_annid")
        serialized = dictrepr.get("annotations")
        if not serialized:
            annset._annotations = {}
            return annset
        # annotations are keyed by their id, converted to int
        annset._annotations = {
            int(a["id"]): Annotation.from_dict(a, owner_set=annset, **kwargs)
            for a in serialized
        }
        return annset

Static methods

def from_dict(dictrepr, owner_doc=None, **kwargs)

Create an AnnotationSet from its dict representation and optionally set the owning document.

Args

dictrepr
the dict representation of the annotation set
owner_doc
the owning document
**kwargs
passed on to the creation of annotations

Returns

the annotation set

Expand source code
@staticmethod
def from_dict(dictrepr, owner_doc=None, **kwargs):
    """
    Create an AnnotationSet from its dict representation and optionally set the owning document.

    The keys "name", "next_annid" and "annotations" of the dict are read; a missing or
    empty "annotations" entry results in a set without annotations.

    Args:
      dictrepr: the dict representation of the annotation set
      owner_doc:  the owning document
      **kwargs: passed on to the creation of annotations

    Returns:
        the annotation set
    """
    annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc)
    annset._next_annid = dictrepr.get("next_annid")
    serialized = dictrepr.get("annotations")
    if not serialized:
        annset._annotations = {}
        return annset
    # annotations are keyed by their id, converted to int
    annset._annotations = {
        int(a["id"]): Annotation.from_dict(a, owner_set=annset, **kwargs)
        for a in serialized
    }
    return annset

Instance variables

property/get changelog

Returns the changelog or None if no changelog is set.

Expand source code
@property
def changelog(self):
    """
    Returns the changelog of the owning document, or None if the set has no owning
    document. (The result is also None if the owning document's changelog is None.)
    """
    owner = self._owner_doc
    return None if owner is None else owner.changelog
property/get document : Union[_ForwardRef('Document'), NoneType]

Returns the owning document, if set. If the owning document was not set, returns None.

Expand source code
@property
def document(self) -> Union["Document", None]:
    """
    The document owning this annotation set, or None if no owning document was set.
    """
    return self._owner_doc
property/get end

Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation. This needs the index and creates it if necessary.

Throws

an exception if there are no annotations in the set.

Expand source code
@property
def end(self):
    """
    Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation.
    This needs the offset index and creates it if necessary.

    Throws:
      an exception if there are no annotations in the set.
    """
    if self.size == 0:
        raise Exception("Annotation set is empty, cannot determine end offset")
    self._create_index_by_offset()
    offset_index = self._index_by_offset
    return offset_index.max_end()
property/get/set immutable : bool

Get or set the immutability of the annotation set. If it is immutable, annotations cannot be added or removed from the set, but the annotations themselves can still have their features modified.

All detached annotation sets are immutable when created, but can be made mutable afterwards.

Expand source code
@property
def immutable(self) -> bool:
    """
    Whether the annotation set is immutable. Annotations cannot be added to or removed
    from an immutable set, but the annotations themselves can still have their
    features modified.

    All detached annotation sets are immutable when created, but can be made mutable afterwards.
    """
    return self._is_immutable
property/get length

Returns the length of the annotation set span.

Throws

an exception if there are no annotations in the set.

Expand source code
@property
def length(self):
    """
    Returns the length of the annotation set span, i.e. the difference between the
    end offset and the start offset of the set.

    Throws:
      an exception if there are no annotations in the set.
    """
    # `start` and `end` are properties, so they must be read as attributes;
    # calling them would try to call the returned offset.
    return self.end - self.start
property/get name

Returns the name of the annotation set.

Note: the name of a set cannot be changed.

Expand source code
@property
def name(self):
    """
    The name of the annotation set.

    Note: the name of a set cannot be changed.
    """
    return self._name
property/get size : int

Returns the number of annotations in the annotation set.

Expand source code
@property
def size(self) -> int:
    """
    The number of annotations currently stored in the annotation set.
    """
    return len(self._annotations)
property/get span : Tuple[int, int]

Returns a tuple with the start and end offsets that correspond to the smallest start offset of any annotation and the largest end offset of any annotation. (Builds the offset index)

Expand source code
@property
def span(self) -> Tuple[int, int]:
    """
    Returns a tuple (start, end) where start is the smallest start offset of any annotation
    in the set and end is the largest end offset of any annotation in the set.
    (Builds the offset index)
    """
    self._create_index_by_offset()
    offset_index = self._index_by_offset
    first_start = offset_index.min_start()
    last_end = offset_index.max_end()
    return first_start, last_end
property/get start

Returns the smallest start offset of all annotations, i.e. the start of the span of the whole set. This needs the index and creates it if necessary.

Throws

an exception if there are no annotations in the set.

Expand source code
@property
def start(self):
    """
    Returns the smallest start offset of all annotations, i.e. the start of the span of the whole set.
    This needs the offset index and creates it if necessary.

    Throws:
      an exception if there are no annotations in the set.
    """
    if self.size == 0:
        raise Exception("Annotation set is empty, cannot determine start offset")
    self._create_index_by_offset()
    offset_index = self._index_by_offset
    return offset_index.min_start()
property/get type_names : KeysView[str]

Gets the names of all types in this set. Creates the type index if necessary.

Expand source code
@property
def type_names(self) -> KeysView[str]:
    """
    The annotation type names occurring in this set.

    The type index is created first if it does not exist yet; the result is the
    key view of that index.
    """
    self._create_index_by_type()
    type_index = self._index_by_type
    return type_index.keys()

Methods

def add(self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None)

Adds an annotation to the set. Once an annotation has been added, the start and end offsets, the type, and the annotation id of the annotation are immutable.

Args

start
start offset
end
end offset
anntype
the annotation type
features
a map, an iterable of tuples or an existing feature map. In any case, the features are used to create a new feature map for this annotation. If the map is empty or this parameter is None, the annotation does not store any map at all.
annid
the annotation id, if not specified the next free one for this set is used. NOTE: the id should normally be left unspecified and get assigned automatically.

Returns

the new annotation

Expand source code
def add(self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None):
    """
    Adds an annotation to the set. Once an annotation has been added, the start and end offsets,
    the type, and the annotation id of the annotation are immutable.

    If the set has a changelog, an "annotation:add" entry is appended to it.

    Args:
      start: start offset
      end: end offset
      anntype: the annotation type
      features: a map, an iterable of tuples or an existing feature map. In any case, the features are used
        to create a new feature map for this annotation. If the map is empty or this parameter is None, the
        annotation does not store any map at all.
      annid: the annotation id, if not specified the next free one for this set is used. NOTE: the id should
        normally be left unspecified and get assigned automatically.

    Returns:
        the new annotation

    Raises:
        Exception: if annid is not an int, if features is an int, if the set is immutable,
          or if an annotation with the given id already exists in the set.
    """
    if annid is not None and not isinstance(annid, int):
        raise Exception("Parameter annid must be an int, mixed up with features?")
    if features is not None and isinstance(features, int):
        raise Exception("Parameter features must not be an int: mixed up with annid?")
    if self._is_immutable:
        raise Exception("Cannot add an annotation to an immutable annotation set")
    self._check_offsets(start, end)
    # Compare against None rather than relying on truthiness: 0 is a valid annotation id
    # and must be checked for duplicates like any other id.
    if annid is not None and annid in self._annotations:
        raise Exception("Cannot add annotation with id {}, already in set".format(annid))
    if annid is None:
        annid = self._next_annid
        self._next_annid = self._next_annid + 1
    # NOTE(review): an explicitly given annid does not advance _next_annid, so a later
    # automatically assigned id could coincide with it -- confirm whether that is intended.
    ann = Annotation(start, end, anntype, features=features, annid=annid)
    ann._owner_set = self
    if not self._annotations:
        self._annotations = {}
    self._annotations[annid] = ann
    self._add_to_indices(ann)
    if self.changelog is not None:
        entry = {
                "command": "annotation:add",
                "set": self.name,
                "start": ann.start,
                "end": ann.end,
                "type": ann.type,
                "features": ann._features.to_dict(),
                "id": ann.id
            }
        self.changelog.append(entry)
    return ann
def add_ann(self, ann, annid: int = None)

Adds a shallow copy of the given ann to the annotation set, either with a new annotation id or with the one given.

Args

ann
the annotation to copy into the set
annid
the annotation id, if not specified the next free one for this set is used. Note: the id should normally be left unspecified and get assigned automatically.

Returns

the added annotation

Expand source code
def add_ann(self, ann, annid: int = None):
    """
    Adds a new annotation to the set which takes its start offset, end offset, type and
    features from the given annotation, either with a new annotation id or with the one given.
    This delegates to `add`.

    Args:
      ann: the annotation to copy into the set
      annid: the annotation id, if not specified the next free one for this set is used. Note:
         the id should normally be left unspecified and get assigned automatically.

    Returns:
      the added annotation
    """
    return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)
def by_offset(self)

Yields lists of annotations which start at the same offset.

Expand source code
def by_offset(self):
    """
    Generate the annotations of this set grouped by their start offset.

    The offset index is created first if necessary. Annotations are consumed in the
    order produced by `self.iter()` and every run of consecutive annotations sharing
    the same start offset is yielded as one list. An empty set yields nothing.

    Yields:
      lists of annotations which start at the same offset
    """
    self._create_index_by_offset()
    group = []
    group_start = -1  # sentinel: no group has been opened yet
    for ann in self.iter():
        if ann.start == group_start:
            group.append(ann)
            continue
        # a different start offset begins a new group: emit the finished one first
        if group_start != -1:
            yield group
        group_start = ann.start
        group = [ann]
    if group_start != -1:
        yield group
def by_span(self)

Yields list of annotations with identical spans.

Expand source code
def by_span(self):
    """
    Generate the annotations of this set grouped by identical span.

    The offset index is created first if necessary. Annotations are consumed in the
    order produced by `self.iter()` and every run of consecutive annotations with the
    same start and end offset is yielded as one list. An empty set yields nothing.

    Yields:
      lists of annotations with identical spans
    """
    self._create_index_by_offset()
    group = []
    group_start = -1  # sentinel: no group has been opened yet
    group_end = -1
    for ann in self.iter():
        if ann.start == group_start and ann.end == group_end:
            group.append(ann)
            continue
        # the span changed: emit the finished group before opening a new one
        if group_start != -1:
            yield group
        group_start = ann.start
        group_end = ann.end
        group = [ann]
    if group_start != -1:
        yield group
def clear(self) ‑> NoneType

Removes all annotations from the set.

Expand source code
def clear(self) -> None:
    """
    Removes all annotations from the set.

    Both the offset index and the type index are discarded, and if the set has a
    changelog, an "annotations:clear" entry is appended to it.
    """
    self._annotations.clear()
    # drop both indices, they no longer match the (now empty) set
    self._index_by_offset = None
    self._index_by_type = None
    log = self.changelog
    if log is None:
        return
    log.append({
        "command": "annotations:clear",
        "set": self.name})
def clone_anns(self, memo=None)

Replaces the annotations in this set with deep copies of the originals. If this is a detached set, then this makes sure that any modifications to the annotations do not affect the original annotations in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect the annotations in this set any more. The owning set of the annotations that get cloned is cleared.

Args

memo
for internal use by our deepcopy implementation.
Expand source code
def clone_anns(self, memo=None):
    """
    Replaces the annotations in this set with deep copies of the originals. If this is a detached set,
    then this makes sure that any modifications to the annotations do not affect the original annotations
    in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect
    the annotations in this set any more. The owning set of the original annotations (the ones
    that get cloned) is cleared.

    Args:
      memo: for internal use by our __deepcopy__ implementation.
    """
    clones = {}
    for annid, original in self._annotations.items():
        clones[annid] = copy.deepcopy(original, memo=memo)
        # the original is detached from its owning set only after it has been copied
        original._owner_set = None
    # swap in the clones in a second step so the dict is not modified while iterating it
    self._annotations.update(clones)
def coextensive(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet

Returns a detached annotation set with all annotations that start and end at the given offsets.

For each annotation ann in the result set, ann.coextensive(span) is True.

Args

start
start offset of the span
end
end offset of the span
annid
the annotation id of the annotation representing the span. (Default value = None)
include_self
if False and the annotation id for the span is given, that annotation is left out of the result set; if True it is kept. (Default value = False)

Returns

annotation set with all annotations that have the same start and end offsets.

Expand source code
@support_annotation_or_set
def coextensive(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
    """
    Returns a detached annotation set with all annotations that start and end at the given offsets.

    For each annotation ann in the result set, ann.coextensive(span) is True.

    Args:
      start: start offset of the span
      end: end offset of the span
      annid: the annotation id of the annotation representing the span. (Default value = None)
      include_self: if False and the annotation id for the span is given, that annotation is
         left out of the result set; if True it is kept. (Default value = False)

    Returns:
      annotation set with all annotations that have the same start and end offsets.
    """
    self._create_index_by_offset()
    matches = self._index_by_offset.at(start, end)
    # the span's own annotation is only skipped when include_self is not set
    skip_id = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip_id)
def contains(self, annorannid: Union[int, Annotation]) ‑> bool

Provides 'annotation in annotation_set' functionality

Args

annorannid
the annotation instance or annotation id to check

Returns

True if the annotation exists in the set, False otherwise

Expand source code
def __contains__(self, annorannid: Union[int, Annotation]) -> bool:
    """
    Provides 'annotation in annotation_set' functionality

    Membership is decided by annotation id only: for an Annotation instance its `id`
    is looked up, anything else is treated as the id itself.

    Args:
        annorannid: the annotation instance or annotation id to check

    Returns:
        `True` if the annotation exists in the set, `False` otherwise
    """
    # Callers may pass the id directly instead of an annotation.
    key = annorannid.id if isinstance(annorannid, Annotation) else annorannid
    return key in self._annotations
def copy(self)

Returns a shallow copy of the annotation set.

Expand source code
def copy(self):
    """
    Returns a shallow copy of the annotation set, as produced by its `__copy__` method.
    """
    return self.__copy__()
def covering(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet

Gets the annotations which contain the given offset range (or annotation/annotation set), i.e. annotations such that the given offset range is within the annotation.

For each annotation ann in the result set, ann.covering(span) is True.

Args

start
the start offset of the span
end
the end offset of the span
annid
the annotation id of the annotation representing the span. (Default value = None)
include_self
if False and the annotation id for the span is given, that annotation is left out of the result set; if True it is kept. (Default value = False)

Returns

an immutable annotation set with the matching annotations, if any

Expand source code
@support_annotation_or_set
def covering(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
    """
    Gets the annotations which contain the given offset range (or annotation/annotation set),
    i.e. annotations such that the given offset range is within the annotation.

    For each annotation ann in the result set, ann.covering(span) is True.

    Args:
      start: the start offset of the span
      end: the end offset of the span
      annid: the annotation id of the annotation representing the span. (Default value = None)
      include_self: if False and the annotation id for the span is given, that annotation is
        left out of the result set; if True it is kept. (Default value = False)

    Returns:
      an immutable annotation set with the matching annotations, if any

    """
    self._create_index_by_offset()
    matches = self._index_by_offset.covering(start, end)
    # the span's own annotation is only skipped when include_self is not set
    skip_id = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip_id)
def deepcopy(self)

Returns a deep copy of the annotation set.

Expand source code
def deepcopy(self):
    """
    Returns a deep copy of the annotation set, created with `copy.deepcopy`.
    """
    return copy.deepcopy(self)
def detach(self, restrict_to=None) ‑> AnnotationSet

Creates an immutable and detached copy of this set, optionally restricted to the given annotation ids. A detached annotation set does not have an owning document and deleting or adding annotations does not change the annotations stored with the document. However, the annotations in a detached annotation set are the same as those stored in the attached set, so updating their features will modify the annotations in the document as well.

Args

restrict_to
an iterable of annotation ids, if None, all the annotations from this set.

Returns

an immutable annotation set

Expand source code
def detach(self, restrict_to=None) -> "AnnotationSet":
    """
    Builds an immutable, detached copy of this set, optionally limited to some annotation ids.

    The new set gets the name "detached-from:" followed by this set's name and the same
    next annotation id. Adding or deleting annotations in a detached set does not change
    the annotations stored with the document. The annotation objects themselves are shared
    with this set though, so updating their features also changes the annotations
    in the document.

    Args:
      restrict_to: an iterable of annotation ids, if None, all the annotations from this set.

    Returns:
      an immutable annotation set
    """
    detached = AnnotationSet(name="detached-from:"+self.name)
    detached._is_immutable = True
    # Either every id of this set or just the requested ones; the annotation objects are shared.
    wanted_ids = self._annotations.keys() if restrict_to is None else restrict_to
    detached._annotations = {annid: self._annotations[annid] for annid in wanted_ids}
    detached._next_annid = self._next_annid
    return detached
def detach_from(self, anns: collections.abc.Iterable) ‑> AnnotationSet

Creates an immutable detached annotation set from the annotations in anns, which can be either a collection of annotations or annotation ids (int numbers) which are assumed to be the annotation ids from this set.

The next annotation id for the created set is the highest seen annotation id from anns plus one.

Args

anns
an iterable of annotations

Returns

an immutable detached annotation set

Expand source code
def detach_from(self, anns: Iterable) -> "AnnotationSet":
    """
    Creates an immutable detached annotation set from the annotations in anns, which can be
    either a collection of annotations or annotation ids (int numbers) which are assumed to
    be the annotation ids from this set.

    The next annotation id for the created set is the highest seen annotation id from anns plus one
    (0 if anns is empty).

    Args:
      anns: an iterable of annotations and/or annotation ids of this set

    Returns:
      an immutable detached annotation set

    Raises:
      KeyError: if an annotation id is given that is not in this set
    """
    annset = AnnotationSet(name="detached-from:"+self.name)
    annset._is_immutable = True
    annset._annotations = {}
    nextid = -1
    for ann in anns:
        if isinstance(ann, int):
            # an id: look the annotation up in this set
            annid = ann
            annset._annotations[annid] = self._annotations[annid]
        else:
            # an annotation instance: store it under its own id
            # (not under the builtin function `id`, which made all instances collide)
            annid = ann.id
            annset._annotations[annid] = ann
        if annid > nextid:
            nextid = annid
    annset._next_annid = nextid + 1
    return annset
def fast_iter(self) ‑> Generator

Yields annotations in insertion order. This is faster than the default iterator and does not need the index (so if the index does not exist, it will not be built).

Expand source code
def fast_iter(self) -> Generator:
    """
    Yields the annotations in the order in which they are stored (insertion order).

    This is faster than the default iterator and never touches the offset index,
    so a missing index is not built. Nothing is yielded for an empty set.
    """
    if not self._annotations:
        return
    yield from self._annotations.values()
def first(self)

Args:

Returns

the first annotation of the set

Expand source code
def first(self):
    """
    Returns the first annotation of the set.

    A set holding a single annotation returns it directly without building the offset
    index; otherwise the first entry produced by the offset index is returned.

    Returns:
      the first annotation

    Raises:
      Exception: if the set is empty
    """
    count = len(self._annotations)
    if count == 0:
        raise Exception("Empty set, there is no first annotation")
    if count == 1:
        (only,) = self._annotations.values()
        return only
    self._create_index_by_offset()
    _start, _end, annid = next(self._index_by_offset.irange(reverse=False))
    return self._annotations[annid]
def get(self, annid: Union[int, Annotation], default=None) ‑> Union[Annotation, NoneType]

Gets the annotation with the given annotation id or returns the given default.

NOTE: for handling cases where legacy code still expects the add method to return an id and not the annotation, this will also accept an annotation so that the frequent pattern still works:

annid = annset.add(b,e,t).id ann = annset.get(annid)

If an annotation is passed the annotation from the set with the id of that annotation is returned, if the annotation is from that set, this will return the same object, if it is still in the set (or return the default value).

Args

annid
the annotation id of the annotation to retrieve.
default
what to return if an annotation with the given id is not found. (Default value = None)
annid
Union[int:

Annotation]:

Returns

the annotation or the default value.

Expand source code
def get(self, annid: Union[int, Annotation], default=None) -> Union[Annotation, None]:
    """Looks up an annotation by its annotation id, falling back to the given default.

    NOTE: to support legacy code which still expects the add method to return
    an id and not the annotation, an annotation is accepted as well, so that the
    frequent pattern still works:

       annid = annset.add(b,e,t).id
       ann = annset.get(annid)

    If an annotation is passed, the annotation stored in this set under the id of that
    annotation is returned (or the default value if there is none).

    Args:
      annid: the annotation id of the annotation to retrieve, or an annotation whose id is used.
      default: what to return if an annotation with the given id is not found. (Default value = None)

    Returns:
      the annotation or the default value.

    """
    key = annid.id if isinstance(annid, Annotation) else annid
    return self._annotations.get(key, default)
def isdetached(self) ‑> bool

Returns True if the annotation set is detached, False otherwise.

Expand source code
def isdetached(self) -> bool:
    """
    Tells whether this annotation set is detached, i.e. has no owning document.

    Returns:
      True if the set has no owning document, False otherwise
    """
    owner = self._owner_doc
    return owner is None
def iter(self, start_ge: Union[int, NoneType] = None, start_lt: Union[NoneType, int] = None, with_type: str = None, reverse: bool = False) ‑> Generator

Yields annotations in document order, optionally limited by the other parameters. If two annotations start at the same offset, they are always ordered by increasing annotation id.

Args

start_ge
the offset from where to start including annotations
start_lt
the last offset to use as the starting offset of an annotation
with_type
only annotations of this type
reverse
process in reverse document order

Yields

annotations in document order

Expand source code
def iter(self,
         start_ge: Union[int, None] = None,
         start_lt: Union[None, int] = None,
         with_type: str = None,
         reverse: bool = False) -> Generator:
    """
    Yields annotations in document order, optionally limited
    by the other parameters. If two annotations start at the same offset, they are always
    ordered by increasing annotation id.

    Args:
      start_ge: the offset from where to start including annotations
      start_lt: the last offset to use as the starting offset of an annotation
      with_type: only annotations of this type; either a single type name or an
        iterable of type names
      reverse: process in reverse document order

    Yields:
      annotations in document order

    """
    if with_type is None:
        allowedtypes = None
    elif isinstance(with_type, str):
        # A single type name must be kept whole: iterating over the string would
        # turn it into a set of its characters and match nothing useful.
        allowedtypes = {with_type}
    else:
        allowedtypes = set(with_type)
    if not self._annotations:
        return
    maxoff = None
    if start_ge is not None:
        assert start_ge >= 0
    if start_lt is not None:
        assert start_lt >= 1
        # NOTE(review): the upper bound handed to the index is start_lt + 1; whether
        # irange treats maxoff as inclusive is not visible here - confirm.
        maxoff = start_lt + 1
    if start_lt is not None and start_ge is not None:
        assert start_lt > start_ge
    self._create_index_by_offset()
    for _start, _end, annid in self._index_by_offset.irange(minoff=start_ge, maxoff=maxoff, reverse=reverse):
        ann = self._annotations[annid]
        if allowedtypes is not None and ann.type not in allowedtypes:
            continue
        yield ann
def last(self)

Args:

Returns

the last annotation of the set

Expand source code
def last(self):
    """
    Returns the last annotation of the set.

    A set holding a single annotation returns it directly without building the offset
    index; otherwise the first entry produced by the offset index when iterated in
    reverse is returned.

    Returns:
      the last annotation

    Raises:
      Exception: if the set is empty
    """
    count = len(self._annotations)
    if count == 0:
        raise Exception("Empty set, there is no last annotation")
    if count == 1:
        (only,) = self._annotations.values()
        return only
    self._create_index_by_offset()
    _start, _end, annid = next(self._index_by_offset.irange(reverse=True))
    return self._annotations[annid]
def overlapping(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet

Gets annotations overlapping with the given span. Instead of the start and end offsets, also accepts an annotation or annotation set.

For each annotation ann in the result set, ann.overlapping(span) is True

Args

start
start offset of the span
end
end offset of the span
annid
the annotation id of the annotation representing the span. (Default value = None)
include_self
if False and the annotation id for the span is given, do not include that annotation in the result set. (Default value = False)

Returns

an immutable annotation set with the matching annotations

Expand source code
@support_annotation_or_set
def overlapping(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
    """
    Gets the annotations which overlap with the given span. Instead of the start and
    end offsets, an annotation or annotation set is accepted as well.

    For each annotation ann in the result set, ann.overlapping(span) is True

    Args:
      start: start offset of the span
      end: end offset of the span
      annid: the annotation id of the annotation representing the span. (Default value = None)
      include_self: if False and the annotation id for the span is given, the annotation
        with that id is left out of the result set. (Default value = False)

    Returns:
      an immutable annotation set with the matching annotations

    """
    self._create_index_by_offset()
    # The span's own annotation is only skipped when its id is known and the caller
    # did not ask for it to be included.
    skip_id = None if (include_self or annid is None) else annid
    candidates = self._index_by_offset.overlapping(start, end)
    return self._restrict_intvs(candidates, ignore=skip_id)
def remove(self, annotation: Union[int, Annotation]) ‑> NoneType

Removes the given annotation which is either the id or the annotation instance.

Throws

exception if the annotation set is immutable or the annotation is not in the set

Args

annotation
either the id (int) or the annotation instance (Annotation)
Expand source code
def remove(self, annotation: Union[int, Annotation]) -> None:
    """
    Removes an annotation, given either by its id or as the annotation instance, from this set.

    The annotation is dropped from the set and from its indices, its owning set is cleared
    and, if a changelog is present, an "annotation:remove" entry is appended to it.

    Args:
      annotation: either the id (int) or the annotation instance (Annotation)

    Raises:
      Exception: if the annotation set is immutable or the annotation is not in the set
    """
    if self._is_immutable:
        raise Exception("Cannot remove an annotation from an immutable annotation set")
    annid = None  # only stays None if the argument is neither an int nor an Annotation
    if isinstance(annotation, int):
        annid = annotation
        if annid not in self._annotations:
            raise Exception("Annotation with id {} not in annotation set, cannot remove".format(annid))
        annotation = self._annotations[annid]
    elif isinstance(annotation, Annotation):
        annid = annotation.id
        if annid not in self._annotations:
            raise Exception("Annotation with id {} does not belong to this set, cannot remove".format(annid))
    # The removed annotation may still be referenced elsewhere and have its features
    # modified later; clearing the owning set prevents such changes from being logged.
    annotation._owner_set = None
    del self._annotations[annid]
    if self.changelog is not None:
        entry = {
            "command": "annotation:remove",
            "set": self.name,
            "id": annid,
        }
        self.changelog.append(entry)
    self._remove_from_indices(annotation)
def reverse_iter(self, **kwargs)

Same as iter, but with the reverse parameter set to true.

Args

kwargs
Same as for iter(), with reverse=True fixed.
**kwargs
will get passed on the Annotation.iter

Returns

same result as iter()

Expand source code
def reverse_iter(self, **kwargs):
    """
    Same as iter, but with the reverse parameter fixed to True.

    Args:
      **kwargs: passed on to iter(); must not contain `reverse` since that
        keyword is already supplied here.

    Returns:
      same result as iter() called with reverse=True

    """
    return self.iter(**kwargs, reverse=True)
def start_eq(self, start: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet

Gets all annotations starting at the given offset (empty if none) and returns them in a detached annotation set.

Note: this can be called with an annotation or annotation set instead of the start offset. If called with an annotation, this annotation is not included in the result set if include_self is False

Args

start
the offset where annotations should start
ignored
dummy parameter to allow the use of annotations and annotation sets
annid
dummy parameter to allow the use of annotations and annotation sets
include_self
should annotation passed be included in the result

Returns

detached annotation set of matching annotations

Expand source code
@support_annotation_or_set
def start_eq(self, start: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
    """
    Gets all annotations starting exactly at the given offset (empty if none) and returns
    them in a detached annotation set.

    Note: this can be called with an annotation or annotation set instead of the start offset. If called
    with an annotation, this annotation is not included in the result set if `include_self` is `False`

    Args:
      start: the offset where annotations should start
      ignored: dummy parameter to allow the use of annotations and annotation sets
      annid:  the annotation id of the annotation passed instead of the offset, if any
      include_self:  should annotation passed be included in the result

    Returns:
        detached annotation set of matching annotations
    """
    self._create_index_by_offset()
    # starting_from() delivers every interval starting at or after `start` (that is what
    # start_ge returns), so keep only the (start, end, annid) tuples whose start offset
    # is exactly the requested one.
    intvs = [intv for intv in self._index_by_offset.starting_from(start) if intv[0] == start]
    if not include_self and annid is not None:
        ignore = annid
    else:
        ignore = None
    return self._restrict_intvs(intvs, ignore=ignore)
def start_ge(self, start: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet

Return the annotations that start at or after the given start offset.

Args

start
Start offset
ignored
dummy parameter to allow the use of annotations and annotation sets
annid
annotation id
include_self
should annotation passed be included in the result

Returns

an immutable annotation set of the matching annotations

Expand source code
@support_annotation_or_set
def start_ge(self, start: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
    """Gets the annotations whose start offset is at or after the given offset.

    Args:
      start: Start offset
      ignored: dummy parameter to allow the use of annotations and annotation sets
      annid:  annotation id
      include_self:  should annotation passed be included in the result

    Returns:
      an immutable annotation set of the matching annotations

    """
    self._create_index_by_offset()
    # Skip the passed annotation only if its id is known and it was not asked for.
    skip_id = None if (include_self or annid is None) else annid
    candidates = self._index_by_offset.starting_from(start)
    return self._restrict_intvs(candidates, ignore=skip_id)
def start_lt(self, offset: int, ignored: Any = None, annid=None) ‑> AnnotationSet

Returns the annotations that start before the given offset (or annotation). This also accepts an annotation or set.

Args

offset
offset before which the annotations should start
ignored
dummy parameter to allow the use of annotations and annotation sets
annid
annotation id

Returns

an immutable annotation set of the matching annotations

Expand source code
@support_annotation_or_set
def start_lt(self, offset: int, ignored: Any = None, annid=None) -> "AnnotationSet":
    """
    Gets the annotations which start before the given offset. An annotation or
    annotation set is accepted in place of the offset as well.

    Args:
      offset: offset before which the annotations should start
      ignored: dummy parameter to allow the use of annotations and annotation sets
      annid:  annotation id (not used to filter the result)

    Returns:
      an immutable annotation set of the matching annotations

    """
    self._create_index_by_offset()
    earlier = self._index_by_offset.starting_before(offset)
    result = self._restrict_intvs(earlier)
    return result
def start_min_ge(self, offset: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet

Gets all annotations starting at the first possible offset at or after the given offset and returns them in an immutable annotation set.

Args

offset
The offset
ignored
dummy parameter to allow the use of annotations and annotation sets
annid
annotation id
include_self
should annotation passed be included in the result

Returns

annotation set of matching annotations

Expand source code
@support_annotation_or_set
def start_min_ge(self, offset: int, ignored: Any = None, annid=None, include_self=False) -> "AnnotationSet":
    """Gets all annotations which start at the smallest start offset found
    at or after the given offset and returns them in an immutable
    annotation set.

    Args:
      offset: The offset
      ignored: dummy parameter to allow the use of annotations and annotation sets
      annid:  annotation id
      include_self: should annotation passed be included in the result

    Returns:
      annotation set of matching annotations

    """
    self._create_index_by_offset()
    skip_id = None if (include_self or annid is None) else annid
    first_start = None
    keep_ids = set()
    # Only the leading run of intervals sharing the first start offset is of interest;
    # the scan stops at the first interval with a different start.
    for intv in self._index_by_offset.starting_from(offset):
        if first_start is None:
            first_start = intv[0]
        elif intv[0] != first_start:
            break
        if skip_id is None or intv[2] != skip_id:
            keep_ids.add(intv[2])
    return self.detach(restrict_to=keep_ids)
def to_dict(self, **kwargs)

Convert an annotation set to its dict representation.

Args

**kwargs
passed on to the dict creation of contained annotations.

Returns

the dict representation of the annotation set.

Expand source code
def to_dict(self, **kwargs):
    """
    Builds the dict representation of this annotation set.

    The result has the keys "name", "annotations" (a list with the dict representation
    of every contained annotation) and "next_annid". The changelog is not included
    since it is stored with the document.

    Args:
      **kwargs: passed on to the dict creation of contained annotations.

    Returns:
        the dict representation of the annotation set.
    """
    ann_dicts = [ann.to_dict(**kwargs) for ann in self._annotations.values()]
    return {
        "name": self.name,
        "annotations": ann_dicts,
        "next_annid": self._next_annid,
    }
def with_type(self, *anntype: collections.abc.Iterable, non_overlapping: bool = False) ‑> AnnotationSet

Gets annotations of the specified type(s). Creates the type index if necessary.

Args

anntype
one or more types or type lists. The union of all types specified that way is used to filter the annotations. If no type is specified, all annotations are selected.
non_overlapping
if True, only return annotations of any of the given types which do not overlap with other annotations. If there are several annotations that start at the same offset, use the type that comes first in the parameters, if there are more than one of that type, use the one that would come first in the usual sort order.

Returns

a detached immutable annotation set with the matching annotations.

Expand source code
def with_type(self, *anntype: Union[str, Iterable],
              non_overlapping: bool = False) -> "AnnotationSet":
    """
    Gets annotations of the specified type(s).
    Creates the type index if necessary.

    Args:
      anntype: one or more types or type lists. The union of all types specified that way
        is used to filter the annotations. If no type is specified, all annotations are
        selected (and non_overlapping is not applied).

      non_overlapping: if True, only return annotations of any of the given types which
        do not overlap with other selected annotations. The annotations are processed grouped
        by start offset; an annotation is only eligible if it starts at or after the end of
        the previously selected one. If several annotations start at the same offset, the one
        whose type comes first in the parameters is used; among equal type priority the one
        with the larger end offset, then the one with the larger annotation id.

    Returns:
        a detached immutable annotation set with the matching annotations.
    """
    # flatten the positional arguments (single type names and iterables of names) into one list
    atypes = []
    for atype in anntype:
        if isinstance(atype, str):
            atypes.append(atype)
        else:
            for t in atype:
                atypes.append(t)
    if not atypes:
        return self.detach()
    self._create_index_by_type()
    # collect the ids of all annotations having any of the requested types
    annids = set()
    for t in atypes:
        idxs = self._index_by_type.get(t)
        if idxs:
            annids.update(idxs)
    if non_overlapping:
        # need to get annotations grouped by start offset and sorted according to
        # what the Annotation class defines
        allanns = sorted(annids, key=lambda x: self._annotations[x])
        allanns = [self._annotations[x] for x in allanns]
        # build consecutive groups of annotations sharing the same start offset
        allannsgrouped = []
        curstart = None
        curset = None
        for ann in allanns:
            if curstart is None:
                curset = [ann]
                curstart = ann.start
            elif curstart == ann.start:
                curset.append(ann)
            else:
                allannsgrouped.append(curset)
                curset = [ann]
                curstart = ann.start
        # do not forget the group still being collected when the loop ended
        if curset:
            allannsgrouped.append(curset)
        retanns = []
        # now go through all the grouped annotations and select the top priority one
        # then skip to the next group that does not overlap with the one we just selected
        # types listed earlier get a higher priority number
        typepriority = dict()
        for i, atype in enumerate(atypes):
            typepriority[atype] = len(atypes)-i
        # smallest start offset an annotation may have without overlapping the last selected one
        curminoffset = 0
        for group in allannsgrouped:
            # instead of sorting, go through the group and find the top priority one
            topann = None
            if len(group) == 1:
                if group[0].start >= curminoffset:
                    topann = group[0]
            elif len(group) == 0:
                raise Exception("We should never get a 0 size group here!")
            else:
                # find the first eligible annotation of the group as the initial candidate
                for i, ann in enumerate(group):
                    if ann.start >= curminoffset:
                        topann = ann
                        break
                # compare the remaining annotations of the group against the current candidate:
                # higher type priority wins, then larger end offset, then larger id
                for ann in group[i+1:]:
                    if ann.start < curminoffset:
                        continue
                    if typepriority[ann.type] > typepriority[topann.type]:
                        topann = ann
                    elif typepriority[ann.type] == typepriority[topann.type]:
                        if ann.end > topann.end:
                            topann = ann
                        elif ann.end == topann.end:
                            if ann.id > topann.id:
                                topann = ann
            if topann is not None:
                retanns.append(topann)
                # following annotations must start at or after the end of the selected one
                curminoffset = topann.end
        annids = [ann.id for ann in retanns]
    return self.detach(restrict_to=annids)
def within(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet

Gets annotations that fall completely within the given offset range, i.e. annotations such that the offset range is covering each of the annotation.

For each annotation ann in the result set, ann.within(span) is True.

Args

start
start offset of the range
end
end offset of the range
annid
the annotation id of the annotation representing the span. (Default value = None)
include_self
if False and the annotation id for the span is given, do not include that annotation in the result set. (Default value = False)

Returns

an immutable annotation set with the matching annotations

Expand source code
@support_annotation_or_set
def within(self, start: int, end: int, annid=None, include_self=False) -> "AnnotationSet":
    """
    Gets the annotations which lie completely within the given offset range, i.e. annotations
    which are covered by the offset range.

    For each annotation ann in the result set, ann.within(span) is True.
    An empty range (start equal to end) gives an empty result without consulting the index.

    Args:
      start: start offset of the range
      end: end offset of the range
      annid: the annotation id of the annotation representing the span. (Default value = None)
      include_self: if False and the annotation id for the span is given, the annotation
         with that id is left out of the result set. (Default value = False)

    Returns:
      an immutable annotation set with the matching annotations

    Raises:
      Exception: if start is greater than end

    """
    if start > end:
        raise Exception("Invalid offset range: {},{}".format(start, end))
    if start == end:
        matches = []
    else:
        self._create_index_by_offset()
        matches = self._index_by_offset.within(start, end)
    skip_id = None if (include_self or annid is None) else annid
    return self._restrict_intvs(matches, ignore=skip_id)
class ChangeLog (store=True)

Creates a ChangeLog.

A ChangeLog stores a log of all changes applied to a document. That log can be used to recreate the document from its initial version in a different process or at a later time.

Args

store
if True, the change log stores the actions it receives (default). This can be set to False if only callbacks are needed.

Expand source code
class ChangeLog:
    def __init__(self, store=True):
        """
        Creates an empty ChangeLog.

        A ChangeLog records all changes applied to a document, so that the document can be
        recreated from its initial version in a different process or at a later time.
        A new ChangeLog uses Python offsets and has no handlers registered.

        Args:
            store: if `True`, the change log stores the actions it receives (default). This can be set
              to false if only callbacks are needed.
        """
        self._store = store
        self._handlers = {}
        self.offset_type = OFFSET_TYPE_PYTHON
        self.changes = []

    def add_handler(self, actions, handler):
        """
        Registers a handler to get called back when any of the actions is added.
        If any handler was already registered for one or more of the actions,
        the new handler overrides it.

        Args:
          actions: either a single action string or a collection of several action strings
          handler: a callable that takes the change information
        """
        if isinstance(actions, str):
            actions = [actions]
        for a in actions:
            if a not in ACTIONS:
                raise Exception(f"Action {a} not known, cannot add handler")
            self._handlers[a] = handler

    def append(self, change: Dict):
        """
        Add a change to the change log. The change must be represented as a dictionary which follows the
        conventions of how to represent changes. This is not using an abstraction yet.

        Args:
          change: dict describing the action/modification
        """
        assert isinstance(change, dict)
        action = change.get("command",None)
        if action is None:
            raise Exception("Odd change, does not have 'command' key")
        if self._store:
            self.changes.append(change)
        hndlr = self._handlers.get(action)
        if hndlr:
            hndlr()

    def __len__(self) -> int:
        """
        Returns the number of actions logged in the ChangeLog.
        """
        return len(self.changes)

    def _fixup_changes(self, method: Callable, replace=False) -> List[Dict]:
        """In-place modify the annotation offsets of the changes according to
        the given method.

        Args:
          method: an object method method for converting offsets from or to python.
          replace: if True, modifies the original change objects in the changelog, otherwise, uses copies (Default value = False)
          method: Callable: 

        Returns:
          the modified changes, a reference to the modified changes list of the instance

        """
        if not replace:
            newchanges = []
        for change in self.changes:
            if not replace:
                chg = dict(change)
            else:
                chg = change
            if "start" in change:
                chg["start"] = method(change["start"])
            if "end" in change:
                chg["end"] = method(change["end"])
            if not replace:
                newchanges.append(chg)
        if replace:
            return self.changes
        else:
            return newchanges

    def fixup_changes(self, offset_mapper, offset_type, replace=True):
        """Update the offsets of all annotations in this changelog to the desired
        offset type, if necessary. If the ChangeLog already has that offset type, this does nothing.

        Args:
          offset_mapper: a prepared offset mapper to use
          offset_type: the desired offset type
          replace: if True, replaces the original offsets in the original change objects, otherwise creates
        new change objects and a new changes list and returs it. (Default value = True)

        Returns:
          a reference to the modified changes

        """
        if offset_type != self.offset_type:
            if offset_type == OFFSET_TYPE_JAVA:
                method = offset_mapper.convert_to_java
            elif offset_type == OFFSET_TYPE_PYTHON:
                method = offset_mapper.convert_to_python
            else:
                raise Exception("Not a proper offset type: {}".format(offset_type))
            if replace:
                self.offset_type = offset_type
            return self._fixup_changes(method, replace=replace)
        else:
            return self.changes

    def __repr__(self) -> str:
        return "ChangeLog([{}])".format(",".join([str(c) for c in self.changes]))

    def format_to(self, fp, prefix="") -> None:
        """
        Prints the log to the given stream.

        Args:
          fp: stream to print to
          prefix:  something to print in front of each action, default=""
        """
        for c in self.changes:
            print(prefix, str(c), sep="", file=fp)

    def to_dict(self, **kwargs):
        """
        Returns a dict representation of the ChangeLog.

        Args:
          **kwargs: ignored
        """
        offset_type = self.offset_type
        changes = self.changes
        if "offset_type" in kwargs and kwargs["offset_type"] != offset_type:
            om = kwargs.get("offset_mapper")
            if om is None:
                raise Exception("Need to convert offsets, but no offset_mapper parameter given")
            offset_type = kwargs["offset_type"]
            if offset_type == OFFSET_TYPE_JAVA:
                changes = self._fixup_changes(om.convert_to_java, replace=False)
            else:
                changes = self._fixup_changes(om.convert_to_python, replace=False)
        return {
            "changes": changes,
            "offset_type": offset_type
        }

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """
        Creates a ChangeLog from a dict representation.

        If the representation uses Java offsets, the offsets get converted and the
        returned ChangeLog is marked as using Python offsets.

        Args:
          dictrepr: the dict representation to convert
          **kwargs: `offset_mapper` (an offset mapper to use) or `document` (used to create
            an offset mapper) is needed if the representation uses Java offsets

        Returns:
          the ChangeLog instance, or None if dictrepr is None

        Raises:
          Exception: if Java offsets have to be converted but neither an offset mapper
            nor a document was given
        """
        if dictrepr is None:
            return None
        cl = ChangeLog()
        cl.changes = dictrepr.get("changes")
        cl.offset_type = dictrepr.get("offset_type")
        if cl.offset_type == OFFSET_TYPE_JAVA:
            # An explicit offset_mapper=None (the default that load()/load_mem() hand to
            # the loader) is treated like a missing one, so `document` can still be used.
            om = kwargs.get("offset_mapper")
            if om is None:
                document = kwargs.get("document")
                if document is None:
                    raise Exception("Loading a changelog with offset_type JAVA, need kwarg 'offset_mapper' or 'document'")
                om = OffsetMapper(document)
            cl._fixup_changes(om.convert_to_python)
            # Record the new offset type: fixup_changes() compares against it and would
            # otherwise convert the already converted offsets a second time.
            cl.offset_type = OFFSET_TYPE_PYTHON
        return cl

    def save(self, whereto, fmt="json", offset_type=None, offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
        """
        Save the ChangeLog in the given format.

        Additional keyword parameters for format "json":
            as_array: boolean, if True stores as array instead of dictionary

        Args:
          whereto: either a file name or something that has a write(string) method.
          fmt: serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          offset_mapper: needed if the offset type should get changed (Default value = None)
          mod: module to use (Default value = "gatenlp.serialization.default")
          **kwargs: additional parameters for the format
        """
        serialization = importlib.import_module(mod)
        save_fn = serialization.get_changelog_saver(whereto, fmt)
        save_fn(
            ChangeLog,
            self,
            to_ext=whereto,
            offset_type=offset_type,
            offset_mapper=offset_mapper,
            **kwargs
        )

    def save_mem(self, fmt="json", offset_type=None, offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
        """
        Serialize the ChangeLog in memory and return what the saver returns.

        Additional keyword parameters for format "json":
            as_array: boolean, if True stores as array instead of dictionary

        Args:
          fmt: serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          offset_mapper: needed if the offset type should get changed (Default value = None)
          mod: module to use (Default value = "gatenlp.serialization.default")
          **kwargs: additional parameters for the format

        Returns:
          the result of the saver called with to_mem=True
        """
        serialization = importlib.import_module(mod)
        save_fn = serialization.get_changelog_saver(None, fmt)
        serialized = save_fn(
            ChangeLog,
            self,
            to_mem=True,
            offset_type=offset_type,
            offset_mapper=offset_mapper,
            **kwargs
        )
        return serialized

    @staticmethod
    def load(wherefrom, fmt="json", offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
        """
        Load a ChangeLog from some serialization.

        If the loaded ChangeLog uses Java offsets, it is converted in place to
        Python offsets using the given offset mapper.

        Args:
          wherefrom: the file or URL to load from
          fmt: the format to use (Default value = "json")
          offset_mapper: offset mapper in case the offsets need to get converted (Default value = None)
          mod: module to use (Default value = "gatenlp.serialization.default")
          **kwargs: any arguments to pass on to the loader

        Returns:
            the ChangeLog instance
        """
        serialization = importlib.import_module(mod)
        load_fn = serialization.get_changelog_loader(wherefrom, fmt)
        changelog = load_fn(ChangeLog, from_ext=wherefrom, offset_mapper=offset_mapper, **kwargs)
        if changelog.offset_type != OFFSET_TYPE_JAVA:
            return changelog
        changelog.fixup_changes(offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True)
        return changelog

    @staticmethod
    def load_mem(wherefrom, fmt="json", offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
        """
        Load a ChangeLog from an in-memory representation in the given format.

        Note: a ChangeLog that uses Java offsets after loading is converted in place
        to Python offsets using the given offset mapper.

        Args:
          wherefrom: the string to deserialize
          fmt: the format to use (Default value = "json")
          offset_mapper: offset mapper in case the offsets need to get converted (Default value = None)
          mod: module to use (Default value = "gatenlp.serialization.default")
          **kwargs: arguments to pass on to the loader

        Returns:
            the ChangeLog instance
        """
        serialization = importlib.import_module(mod)
        load_fn = serialization.get_changelog_loader(None, fmt)
        changelog = load_fn(ChangeLog, from_mem=wherefrom, offset_mapper=offset_mapper, **kwargs)
        if changelog.offset_type != OFFSET_TYPE_JAVA:
            return changelog
        changelog.fixup_changes(offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True)
        return changelog

    def pprint(self, out=None):
        """
        Pretty prints to the given output stream, sys.stdout if not given.

        Prints an opening "ChangeLog(" line, then one line per change showing its index,
        its command and the remaining fields of the change, then a closing ")" line.

        Args:
          out:  the stream to print to, if None uses sys.stdout
        """
        if out is None:
            out = sys.stdout
        print("ChangeLog(", file=out)
        for i, c in enumerate(self.changes):
            # work on a copy so the stored change keeps its "command" entry
            parms = c.copy()
            cmd = parms.pop("command", None)
            print(f"{i}: cmd={cmd} {parms}", file=out)
        print(")", file=out)

Static methods

def from_dict(dictrepr, **kwargs)

Creates a ChangeLog from a dict representation.

Args

dictrepr
the dict representation to convert
**kwargs
`offset_mapper` or `document` is needed if the dict representation uses Java offsets
Expand source code
@staticmethod
def from_dict(dictrepr, **kwargs):
    """
    Creates a ChangeLog from a dict representation.

    If the representation uses Java offsets, the offsets get converted and the
    returned ChangeLog is marked as using Python offsets.

    Args:
      dictrepr: the dict representation to convert
      **kwargs: `offset_mapper` (an offset mapper to use) or `document` (used to create
        an offset mapper) is needed if the representation uses Java offsets

    Returns:
      the ChangeLog instance, or None if dictrepr is None

    Raises:
      Exception: if Java offsets have to be converted but neither an offset mapper
        nor a document was given
    """
    if dictrepr is None:
        return None
    cl = ChangeLog()
    cl.changes = dictrepr.get("changes")
    cl.offset_type = dictrepr.get("offset_type")
    if cl.offset_type == OFFSET_TYPE_JAVA:
        # An explicit offset_mapper=None (the default that load()/load_mem() hand to
        # the loader) is treated like a missing one, so `document` can still be used.
        om = kwargs.get("offset_mapper")
        if om is None:
            document = kwargs.get("document")
            if document is None:
                raise Exception("Loading a changelog with offset_type JAVA, need kwarg 'offset_mapper' or 'document'")
            om = OffsetMapper(document)
        cl._fixup_changes(om.convert_to_python)
        # Record the new offset type: fixup_changes() compares against it and would
        # otherwise convert the already converted offsets a second time.
        cl.offset_type = OFFSET_TYPE_PYTHON
    return cl
def load(wherefrom, fmt='json', offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)

Load ChangeLog from some serialization.

Args

wherefrom
the file or URL to load from
offset_mapper
offset mapper in case the offsets need to get converted (Default value = None)
fmt
the format to use (Default value = "json")
mod
(Default value = "gatenlp.serialization.default")
**kwargs
any arguments to pass on to the loader

Returns

the ChangeLog instance

Expand source code
@staticmethod
def load(wherefrom, fmt="json", offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Load a ChangeLog from some serialization.

    If the loaded ChangeLog uses Java offsets, it is converted in place to
    Python offsets using the given offset mapper.

    Args:
      wherefrom: the file or URL to load from
      fmt: the format to use (Default value = "json")
      offset_mapper: offset mapper in case the offsets need to get converted (Default value = None)
      mod: module to use (Default value = "gatenlp.serialization.default")
      **kwargs: any arguments to pass on to the loader

    Returns:
        the ChangeLog instance
    """
    serialization = importlib.import_module(mod)
    load_fn = serialization.get_changelog_loader(wherefrom, fmt)
    changelog = load_fn(ChangeLog, from_ext=wherefrom, offset_mapper=offset_mapper, **kwargs)
    if changelog.offset_type != OFFSET_TYPE_JAVA:
        return changelog
    changelog.fixup_changes(offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True)
    return changelog
def load_mem(wherefrom, fmt='json', offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)

Load a ChangeLog from a string representation in the given format.

Note: the offset type is always converted to PYTHON when loading!

Args

wherefrom
the string to deserialize
fmt
the format to use, default: "json"
offset_mapper
offset mapper in case the offsets need to get converted (Default value = None)
mod
(Default value = "gatenlp.serialization.default")
**kwargs
arguments to pass on to the loader

Returns

the ChangeLog instance

Expand source code
@staticmethod
def load_mem(wherefrom, fmt="json", offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Load a ChangeLog from an in-memory representation in the given format.

    Note: a ChangeLog that uses Java offsets after loading is converted in place
    to Python offsets using the given offset mapper.

    Args:
      wherefrom: the string to deserialize
      fmt: the format to use (Default value = "json")
      offset_mapper: offset mapper in case the offsets need to get converted (Default value = None)
      mod: module to use (Default value = "gatenlp.serialization.default")
      **kwargs: arguments to pass on to the loader

    Returns:
        the ChangeLog instance
    """
    serialization = importlib.import_module(mod)
    load_fn = serialization.get_changelog_loader(None, fmt)
    changelog = load_fn(ChangeLog, from_mem=wherefrom, offset_mapper=offset_mapper, **kwargs)
    if changelog.offset_type != OFFSET_TYPE_JAVA:
        return changelog
    changelog.fixup_changes(offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True)
    return changelog

Methods

def add_handler(self, actions, handler)

Registers a handler to get called back when any of the actions is added. If any handler was already registered for one or more of the actions, the new handler overrides it.

Args

actions
either a single action string or a collection of several action strings
handler
a callable that takes the change information
Expand source code
def add_handler(self, actions, handler):
    """
    Register a callback for one or more actions. A handler registered earlier for
    the same action is overridden by the new one.

    Args:
      actions: either a single action string or a collection of several action strings
      handler: a callable that takes the change information

    Raises:
      Exception: if an action is not one of the known ACTIONS; handlers for the
        actions listed before the unknown one are already registered at that point
    """
    action_names = [actions] if isinstance(actions, str) else actions
    for name in action_names:
        if name not in ACTIONS:
            raise Exception(f"Action {name} not known, cannot add handler")
        self._handlers[name] = handler
def append(self, change: Dict)

Add a change to the change log. The change must be represented as a dictionary which follows the conventions of how to represent changes. This is not using an abstraction yet.

Args

change
dict describing the action/modification
Expand source code
def append(self, change: Dict):
    """
    Add a change to the change log. The change must be represented as a dictionary which follows the
    conventions of how to represent changes. This is not using an abstraction yet.

    The change is stored only if this ChangeLog stores changes. If a handler is
    registered for the command of the change, it is called with the change.

    Args:
      change: dict describing the action/modification, must have the key "command"

    Raises:
      Exception: if the change has no "command" entry
    """
    assert isinstance(change, dict)
    action = change.get("command")
    if action is None:
        raise Exception("Odd change, does not have 'command' key")
    if self._store:
        self.changes.append(change)
    hndlr = self._handlers.get(action)
    if hndlr:
        # add_handler documents handlers as callables that take the change information
        hndlr(change)
def fixup_changes(self, offset_mapper, offset_type, replace=True)

Update the offsets of all annotations in this changelog to the desired offset type, if necessary. If the ChangeLog already has that offset type, this does nothing.

Args

offset_mapper
a prepared offset mapper to use
offset_type
the desired offset type
replace
if True, replaces the original offsets in the original change objects, otherwise creates

new change objects and a new changes list and returns it. (Default value = True)

Returns

a reference to the modified changes

Expand source code
def fixup_changes(self, offset_mapper, offset_type, replace=True):
    """Convert the offsets in this changelog to the desired offset type, if necessary.
    If the ChangeLog already has that offset type, the stored changes are returned unchanged.

    Args:
      offset_mapper: a prepared offset mapper to use
      offset_type: the desired offset type
      replace: if True, replaces the original offsets in the original change objects and
        records the new offset type, otherwise creates new change objects and a new
        changes list and returns it. (Default value = True)

    Returns:
      a reference to the modified changes

    Raises:
      Exception: if offset_type is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON
    """
    if offset_type == self.offset_type:
        return self.changes
    if offset_type == OFFSET_TYPE_JAVA:
        converter = offset_mapper.convert_to_java
    elif offset_type == OFFSET_TYPE_PYTHON:
        converter = offset_mapper.convert_to_python
    else:
        raise Exception("Not a proper offset type: {}".format(offset_type))
    if replace:
        self.offset_type = offset_type
    return self._fixup_changes(converter, replace=replace)
def format_to(self, fp, prefix='') ‑> NoneType

Prints the log to the given stream.

Args

fp
stream to print to
prefix
something to print in front of each action, default=""
Expand source code
def format_to(self, fp, prefix="") -> None:
    """
    Write the log to the given stream, one change per line.

    Args:
      fp: stream to print to
      prefix: text printed directly in front of each change (Default value = "")
    """
    for change in self.changes:
        print(f"{prefix}{change}", file=fp)
def pprint(self, out=None)

Pretty prints to the given output stream, sys.stdout if not given.

Args

out
the stream to print to, if None uses sys.stdout
Expand source code
def pprint(self, out=None):
    """
    Pretty prints to the given output stream, sys.stdout if not given.

    Prints an opening "ChangeLog(" line, then one line per change showing its index,
    its command and the remaining fields of the change, then a closing ")" line.

    Args:
      out:  the stream to print to, if None uses sys.stdout
    """
    if out is None:
        out = sys.stdout
    print("ChangeLog(", file=out)
    for i, c in enumerate(self.changes):
        # work on a copy so the stored change keeps its "command" entry
        parms = c.copy()
        cmd = parms.pop("command", None)
        print(f"{i}: cmd={cmd} {parms}", file=out)
    print(")", file=out)
def save(self, whereto, fmt='json', offset_type=None, offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)

Save the ChangeLog in the given format.

Additional keyword parameters for format "json": as_array: boolean, if True stores as array instead of dictionary

Args

whereto
either a file name or something that has a write(string) method.
fmt
serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
offset_type
store using the given offset type or keep the current if None (Default value = None)
offset_mapper
needed if the offset type should get changed (Default value = None)
mod
module to use (Default value = "gatenlp.serialization.default")
**kwargs
additional parameters for the format
Expand source code
def save(self, whereto, fmt="json", offset_type=None, offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Save the ChangeLog in the given format.

    Additional keyword parameters for format "json":
        as_array: boolean, if True stores as array instead of dictionary

    Args:
      whereto: either a file name or something that has a write(string) method.
      fmt: serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      offset_mapper: needed if the offset type should get changed (Default value = None)
      mod: module to use (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the format
    """
    serialization = importlib.import_module(mod)
    save_fn = serialization.get_changelog_saver(whereto, fmt)
    save_fn(
        ChangeLog,
        self,
        to_ext=whereto,
        offset_type=offset_type,
        offset_mapper=offset_mapper,
        **kwargs
    )
def save_mem(self, fmt='json', offset_type=None, offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)

Serialize and save to a string.

Additional keyword parameters for format "json": as_array: boolean, if True stores as array instead of dictionary, using to

Args

fmt
serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
offset_type
store using the given offset type or keep the current if None (Default value = None)
offset_mapper
needed if the offset type should get changed (Default value = None)
mod
module to use (Default value = "gatenlp.serialization.default")
**kwargs
additional parameters for the format
Expand source code
def save_mem(self, fmt="json", offset_type=None, offset_mapper=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Serialize the ChangeLog in memory and return what the saver returns.

    Additional keyword parameters for format "json":
        as_array: boolean, if True stores as array instead of dictionary

    Args:
      fmt: serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      offset_mapper: needed if the offset type should get changed (Default value = None)
      mod: module to use (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the format

    Returns:
      the result of the saver called with to_mem=True
    """
    serialization = importlib.import_module(mod)
    save_fn = serialization.get_changelog_saver(None, fmt)
    serialized = save_fn(
        ChangeLog,
        self,
        to_mem=True,
        offset_type=offset_type,
        offset_mapper=offset_mapper,
        **kwargs
    )
    return serialized
def to_dict(self, **kwargs)

Returns a dict representation of the ChangeLog.

Args

**kwargs
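`offset_type` selects the offset type of the returned changes and `offset_mapper` is needed if that requires a conversion; other keyword arguments are ignored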
Expand source code
def to_dict(self, **kwargs):
    """
    Returns a dict representation of the ChangeLog.

    The result has the key "changes" (the list of changes) and the key
    "offset_type" (the offset type used by those changes).

    Args:
      **kwargs: `offset_type` requests the offset type for the returned changes; if it
        is missing, None, or equal to the current offset type, the stored changes are
        returned as they are. `offset_mapper` is required whenever the offsets have to
        be converted. Other keyword arguments are ignored.

    Returns:
      the dict representation

    Raises:
      Exception: if the offsets need to be converted but no offset mapper was given,
        or if the requested offset type is not a proper offset type
    """
    wanted = kwargs.get("offset_type")
    # None means "keep the current offset type"; save() and save_mem() hand
    # offset_type=None to the saver by default.
    if wanted is None or wanted == self.offset_type:
        return {"changes": self.changes, "offset_type": self.offset_type}
    om = kwargs.get("offset_mapper")
    if om is None:
        raise Exception("Need to convert offsets, but no offset_mapper parameter given")
    # fixup_changes rejects unknown offset types; with replace=False it does not
    # touch the offset type of this ChangeLog and returns a new changes list.
    changes = self.fixup_changes(om, wanted, replace=False)
    return {"changes": changes, "offset_type": wanted}
class Document (text: str = None, features=None, changelog: ChangeLog = None)

Represent a GATE document. This is different from the original Java GATE representation in several ways:

  • the text is not mutable and can only be set at creation time, so there is no "edit" method

  • as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is no separate "FeatureMap" to store them

  • does not support listener callbacks

  • there is no separate abstraction for "content", the only content possible is text which is a unicode string that can be accessed with the "text" property
  • Spans of text can be directly accessed using doc[from:to]
  • Features may only have string keys and values which can be json-serialised
  • Annotation offsets by default are numbers of Unicode code points, this is different from Java where the offsets are UTF-16 Unicode code units
  • Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index of UTF-16 code unit and back)
  • No part of the document has to be present, not even the text (this allows saving just the annotations separately from the text)
  • Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)

Args

text
the text of the document. The text can be None to indicate that no initial text should be set. Once

the text has been set for a document, it is immutable and cannot be changed. features: the initial document features to set, a sequence of key/value tuples changelog: a ChangeLog instance to use to log changes.

Returns:

Expand source code
class Document:
    """Represent a GATE document. This is different from the original Java GATE representation in several ways:
    
    * the text is not mutable and can only be set at creation time, so there is no "edit" method
    
    * as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is
      no separate "FeatureMap" to store them
    
    * does not support listener callbacks
    * there is no separate abstraction for "content", the only content possible is text which is a unicode string
      that can be acessed with the "text()" method
    * Spans of text can be directly accessed using doc[from:to]
    * Features may only have string keys and values which can be json-serialised
    * Annotation offsets by default are number of Unicde code points, this is different from Java where the offsets
      are UTF-16 Unicode code units
    * Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index
      of UTF-16 code unit and back)
    * No part of the document has to be present, not even the text (this allows saving just the annotations separately
      from the text)
    * Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)

    Args:
      text: the text of the document. The text can be None to indicate that no initial text should be set. Once
    the text has been set for a document, it is immutable and cannot be changed.
      features: the initial document features to set, a sequence of key/value tuples
      changelog: a ChangeLog instance to use to log changes.

    Returns:

    """

    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None):
        """Set up a document with Python offsets, an empty name and no annotation sets."""
        assert text is None or isinstance(text, str)
        assert changelog is None or isinstance(changelog, ChangeLog)
        self._text = text
        self._name = ""
        self._annotation_sets = dict()
        self.offset_type = OFFSET_TYPE_PYTHON
        # The changelog is assigned before the features are created: the Features
        # instance gets _log_feature_change as its logger, which reads self._changelog.
        self._changelog = changelog
        self._features = Features(features, logger=self._log_feature_change)

    @property
    def name(self):
        """Get the name of the document, the empty string if no name has been set."""
        return self._name

    @name.setter
    def name(self, val):
        """
        Set the name of the document and record the change in the changelog, if there is one.

        Args:
          val: the new name, None is stored as the empty string

        Raises:
          Exception: if val is neither None nor a string
        """
        new_name = "" if val is None else val
        if not isinstance(new_name, str):
            raise Exception("Name must be a string")
        self._name = new_name
        if self._changelog is None:
            return
        self._changelog.append({"command": "name:set", "name": new_name})

    def _ensure_type_python(self) -> None:
        """Raise an Exception unless the document currently uses Python offsets."""
        if self.offset_type == OFFSET_TYPE_PYTHON:
            return
        raise Exception("Document cannot be used if it is not type PYTHON, use to_type(OFFSET_TYPE_PYTHON) first")

    def _fixup_annotations(self, method: Callable) -> None:
        """
        Replace the start and end offset of every annotation in every annotation set
        of this document with the result of calling method on that offset.

        Args:
          method: callable that takes an offset and returns the converted offset
        """
        for annset in self._annotation_sets.values():
            annotations = annset._annotations
            if annotations is None:
                continue
            for ann in annotations.values():
                ann._start = method(ann._start)
                ann._end = method(ann._end)

    def to_offset_type(self, offsettype: str) -> OffsetMapper:
        """Convert the offsets of all annotations in this document to the requested
        offset type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. Nothing happens if
        the document already uses that offset type.

        NOTE: if the document has a ChangeLog, it is NOT also converted!

        Args:
          offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

        Returns:
          the offset mapper used for the conversion, or None if nothing was converted

        Raises:
          Exception: if the requested or the current offset type is not one of the two
            known offset types
        """
        if offsettype == self.offset_type:
            return None
        to_java = offsettype == OFFSET_TYPE_JAVA and self.offset_type == OFFSET_TYPE_PYTHON
        to_python = offsettype == OFFSET_TYPE_PYTHON and self.offset_type == OFFSET_TYPE_JAVA
        if not (to_java or to_python):
            raise Exception("Odd offset type")
        # the mapper is built from the document text and shared by all annotations
        om = OffsetMapper(self._text)
        if to_java:
            self._fixup_annotations(om.convert_to_java)
            self.offset_type = OFFSET_TYPE_JAVA
        else:
            self._fixup_annotations(om.convert_to_python)
            self.offset_type = OFFSET_TYPE_PYTHON
        return om

    def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
        """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
        a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

        The document is modified in-place.

        Args:
          changes: one or more changes
          handle_existing_anns: what to do if a change tries to add an annotation
            with an annotation id that already exists in the target set:
            ADDANN_IGNORE skips the change, ADDANN_ADD_WITH_NEW_ID adds the annotation
            without an explicit id, ADDANN_REPLACE_ANNOTATION removes the existing
            annotation and adds the new one under the same id, ADDANN_UPDATE_FEATURES
            updates the existing features from the change, ADDANN_REPLACE_FEATURES
            clears the existing features first, ADDANN_ADD_NEW_FEATURES only sets
            features whose names are not present yet.
            (Default value = ADDANN_ADD_WITH_NEW_ID)

        Raises:
          Exception: if a change has no "command" field or the command is not known
        """
        if isinstance(changes, dict):
            changes = [changes]
        elif isinstance(changes, ChangeLog):
            changes = changes.changes
        for change in changes:
            cmd = change.get("command")
            fname = change.get("feature")
            fvalue = change.get("value")
            features = change.get("features")
            sname = change.get("set")
            annid = change.get("id")
            if cmd is None:
                raise Exception("Change without field 'command'")
            if cmd == ACTION_ADD_ANNSET:
                assert sname is not None
                self.annset(sname)
            elif cmd == ACTION_ADD_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                start = change.get("start")
                end = change.get("end")
                anntype = change.get("type")

                if ann is None:
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_IGNORE:
                    pass
                elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                    # no annid given, but the features of the change are kept
                    anns.add(start, end, anntype, features=features)
                elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                    anns.remove(annid)
                    # pass id and features by keyword, like the plain add above
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                    ann.features.clear()
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                    if features:
                        fns = ann.feature_names()
                        for f in features.keys():
                            if f not in fns:
                                ann.features[f] = features[f]

            elif cmd == ACTION_CLEAR_ANNS:
                assert sname is not None
                anns = self.annset(sname)
                anns.clear()
            elif cmd == ACTION_CLEAR_ANN_FEATURES:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    ann.features.clear()
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_CLEAR_DOC_FEATURES:
                self.features.clear()
            elif cmd == ACTION_SET_ANN_FEATURE:
                assert fname is not None
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                ann.features[fname] = fvalue
            elif cmd == ACTION_DEL_ANN_FEATURE:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    if fname is not None:
                        ann.features.pop(fname, None)
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_DEL_DOC_FEATURE:
                assert fname is not None
                self.features.pop(fname, None)
            elif cmd == ACTION_DEL_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                anns.remove(annid)
            elif cmd == ACTION_SET_DOC_FEATURE:
                assert fname is not None
                self.features[fname] = fvalue
            else:
                raise Exception(f"Unknown ChangeLog action: {cmd}")

    @property
    def features(self):
        """Get the features of the document.

        This is the Features instance created for the document, which was given the
        document's feature-change logger when it was created.

        Returns:
          the Features instance of this document
        """
        return self._features


    @property
    def changelog(self):
        """Get the ChangeLog or None if no ChangeLog has been set.

        Returns:
          the changelog currently used by this document or None
        """
        return self._changelog

    @changelog.setter
    def changelog(self, chlog):
        """Make the document use the given changelog to record all changes
        from this moment on.

        Args:
          chlog: the new changelog to use or None to not use any

        Returns:
          the changelog used previously or None; note that this value is not
          available to the caller when the setter runs through a plain
          attribute assignment
        """
        previous = self._changelog
        self._changelog = chlog
        return previous

    @property
    def text(self) -> str:
        """Get the text of the document. For a partial document, the text may be None.

        Returns:
          the text of the document

        Raises:
          Exception: if the document does not currently use Python offsets
        """
        self._ensure_type_python()
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        """Set the text of the document. This is only possible as long as no text has
        been set yet, after that the text is immutable.

        Args:
          value: the text for the document

        Raises:
          NotImplementedError: if the document already has a text
        """
        if self._text is not None:
            raise NotImplementedError("Text cannot be modified")
        self._text = value

    def _log_feature_change(self, command: str, feature: str = None, value=None) -> None:
        """
        Record a change to the document features in the changelog, if there is one.

        The command gets the prefix "doc-" before it is logged. The feature name and
        value are only stored in the change for the resulting command "doc-feature:set".

        Args:
          command: the feature command without the "doc-" prefix
          feature: name of the affected feature (Default value = None)
          value: the new feature value (Default value = None)
        """
        if self._changelog is None:
            return
        full_command = "doc-" + command
        entry = {"command": full_command}
        # NOTE(review): for every other command the feature name is not recorded,
        # confirm that this is intended for commands that remove a single feature.
        if full_command == "doc-feature:set":
            entry["feature"] = feature
            entry["value"] = value
        self._changelog.append(entry)

    def __len__(self) -> int:
        """
        Return the length of the document text, 0 if the document has no text.

        Note: this does not convert the document, it fails if the document does not
        currently use Python offsets.

        Returns:
          the length of the document text

        Raises:
          Exception: if the document does not currently use Python offsets
        """
        self._ensure_type_python()
        return 0 if self._text is None else len(self._text)

    def __getitem__(self, span) -> str:
        """
        Get the text for the given span.

        Args:
          span: a single number, an offset range of the form from:to, an annotation
            or an annotation set. For an annotation the annotation's offsets are used,
            for an annotation set the values of its start() and end() methods.

        Returns:
          the text of the span

        Raises:
          Exception: if the document does not currently use Python offsets
        """
        self._ensure_type_python()
        if isinstance(span, Annotation):
            selection = slice(span._start, span._end)
        elif isinstance(span, AnnotationSet):
            selection = slice(span.start(), span.end())
        else:
            selection = span
        return self.text[selection]

    def annset(self, name: str = "") -> AnnotationSet:
        """Get the named annotation set, if name is not given or the empty string, the
        default annotation set. If the annotation set does not already exist, it is created.

        Args:
          name: the annotation set name, the empty string is used for the
            "default annotation set". (Default value = "")

        Returns:
          the specified annotation set.

        """
        self._ensure_type_python()
        if name in self._annotation_sets:
            return self._annotation_sets[name]
        annset = AnnotationSet(owner_doc=self, name=name)
        self._annotation_sets[name] = annset
        # Compare against None instead of relying on truthiness: a changelog
        # object that is falsy while it is still empty would otherwise never
        # get its first change recorded.
        if self._changelog is not None:
            self._changelog.append({
                "command": "annotations:add",
                "set": name})
        return annset

    def annset_names(self) -> KeysView[str]:
        """

        Args:

        Returns:
          :return: annotation set names

        """
        self._ensure_type_python()
        return list(self._annotation_sets.keys())
    
    def remove_annset(self, name: str):
        """Completely remove the annotation set.

        Args:
          name: name of the annotation set to remove
          name: str: 

        Returns:

        """
        if name not in self._annotation_sets:
            raise Exception(f"AnnotationSet with name {name} does not exist")
        del self._annotation_sets[name]
        if self._changelog:
            self._changelog.append({
                "command": "annotations:remove",
                "set": name})

    def __repr__(self) -> str:
        """
        String representation of the document, showing all content.

        :return: string representation
        """
        return "Document({},features={},anns={})".format(self.text, self._features, self._annotation_sets.__repr__())

    def __str__(self) -> str:
        asets = "["+",".join([f"'{k}':{len(v)}" for k, v in self._annotation_sets.items()])+"]"
        return "Document({},features={},anns={})".format(self.text, self._features, asets)

    def to_dict(self, offset_type=None, **kwargs):
        """Convert this instance to a dictionary that can be used to re-create the
        instance with from_dict.
        NOTE: if there is an active changelog, it is not included in the output as this
        field is considered a transient field!

        Args:
          offset_type: convert to the given offset type on the fly, None keeps the
            offset type of the document (Default value = None)
          **kwargs: passed on to the to_dict method of every annotation set

        Returns:
          the dictionary representation of this instance

        """
        if offset_type is None:
            offset_type = self.offset_type
        else:
            assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON
            # Only a different offset type needs an offset mapper, and a mapper
            # can only be created if there is a text. The mapper is handed down
            # to where the annotations actually get converted.
            if offset_type != self.offset_type and self._text is not None:
                kwargs["offset_mapper"] = OffsetMapper(self._text)
                kwargs["offset_type"] = offset_type

        annsets = {}
        for setname, annset in self._annotation_sets.items():
            annsets[setname] = annset.to_dict(**kwargs)
        return {
            "annotation_sets": annsets,
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": offset_type,
            "name": self.name,
        }

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Return a Document instance as represented by the dictionary dictrepr.

        Args:
          dictrepr: the dictionary representation of the document, with the keys
            "text", "features", "name", "offset_type" and "annotation_sets"
          **kwargs: not used

        Returns:
          the initialized Document instance

        Raises:
          Exception: if the offset type in dictrepr is neither OFFSET_TYPE_JAVA
            nor OFFSET_TYPE_PYTHON

        """
        feats = dictrepr.get("features")
        doc = Document(dictrepr.get("text"), features=feats)
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        # A missing (or None) "annotation_sets" entry is treated as "no
        # annotation sets" instead of failing with an AttributeError on None.
        annsets_repr = dictrepr.get("annotation_sets") or {}
        annsets = {name: AnnotationSet.from_dict(adict, owner_doc=doc)
                   for name, adict in annsets_repr.items()}
        doc._annotation_sets = annsets
        return doc

    def save(self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Save the document to the destination file.

        Args:
          destination: either a file name or something that has a write(string) method.
          fmt: serialization format, by default the format is inferred from the file extension.
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the document saver.
          **kwargs: 

        Returns:

        """
        if fmt is None or isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(destination, fmt)
            saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
        else:
            # assume fmt is a callable to get used directly
            fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)

    def save_mem(self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Serialize to a string or bytes in the given format.

        Args:
          fmt: serialization format to use. (Default value = "json")
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the format.
          **kwargs: 

        Returns:

        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(None, fmt)
            return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
        else:
            fmt(Document, self, to_mem=True, offset_type=offset_type, **kwargs)

    @staticmethod
    def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
        """Load or import a document from the given source. The source can be a file path or
        file name or a URL. If the type of the source is str, then if it starts with
        "http[s]://" it will get treated as a URL. In order to deliberately use a file instead
        of a URL, create a pathlib Path, in order to deliberately use a URL instead of a file
        parse the URL using urllib.

        Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

        Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

        NOTE: the offset type of the document is always converted to PYTHON when loading!

        Args:
          source: the URL or file path to load from.
          fmt: the format of the source. By default the format is inferred by the file
            extension. The format can be a format mnemonic like "json", "html", or a known
            mime type like "text/bdocjs". Anything that is neither None nor a string is
            used directly as the document loader.
          mod: the name of a module where the document loader is implemented.
            (Default value = "gatenlp.serialization.default")
          **kwargs: additional format specific keyword arguments to pass to the loader

        Returns:
          the loaded document

        """
        if fmt is None or isinstance(fmt, str):
            # look up the loader for the source/format in the given module
            loader = importlib.import_module(mod).get_document_loader(source, fmt)
        else:
            loader = fmt
        loaded = loader(Document, from_ext=source, **kwargs)
        if loaded.offset_type == OFFSET_TYPE_JAVA:
            loaded.to_offset_type(OFFSET_TYPE_PYTHON)
        return loaded

    @staticmethod
    def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
        """Create a document from the in-memory serialization in source. Source can be a string or
        bytes, depending on the format.
        
        Note: the offset type is always converted to PYTHON when loading!

        Args:
          source: the string/bytes to deserialize
          fmt: the format (Default value = "json")
          mod: the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
          kwargs: additional arguments to pass to the loader
          **kwargs: 

        Returns:

        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            loader = m.get_document_loader(None, fmt)
            doc = loader(Document, from_mem=source, **kwargs)
        else:
            doc = fmt(Document, from_mem=source, **kwargs)
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    def __copy__(self):
        """
        Creates a shallow copy except the changelog which is set to None.
        The copy refers to the same annotation sets object as this document, the
        features are copied using their copy() method.

        :return: shallow copy of the document
        """
        clone = Document(self._text)
        clone._annotation_sets = self._annotation_sets
        clone.offset_type = self.offset_type
        clone._features = self._features.copy()
        return clone

    def copy(self):
        """Creates a shallow copy except the changelog which is set to None.

        This simply delegates to __copy__.

        Returns:
          shallow copy of the document

        """
        return self.__copy__()

    def __deepcopy__(self, memo):
        """
        Creates a deep copy, except the changelog which is set to None.
        The features (via their dictionary representation) and the annotation sets
        are deep-copied, text and offset type are taken over.

        :param memo: the memoization dictionary to use.

        :return: a deep copy of the document.
        """
        features = None if self._features is None else lib_copy.deepcopy(self._features.to_dict(), memo)
        clone = Document(self._text, features=features)
        clone._changelog = None
        clone._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        clone.offset_type = self.offset_type
        return clone

    def deepcopy(self, memo=None):
        """Creates a deep copy, except the changelog which is set to None.

        Args:
          memo: the memoization dictionary to use. (Default value = None)

        Returns:
          a deep copy of the document.

        """
        # NOTE(review): assumes lib_copy is the standard library copy module,
        # whose deepcopy() dispatches to __deepcopy__ of this class - confirm.
        return lib_copy.deepcopy(self, memo=memo)

    def _repr_html_(self):
        """
        Render function for Jupyter notebooks. Returns the html-ann-viewer HTML.
        This renders the HTML for notebook, for offline mode, but does not add the JS
        but instead initializes the JS in the notebook unless gatenlp.init_notebook()
        has been called already.

        :return: the HTML returned by _notebook_show
        """
        return self._notebook_show()

    def notebook_show(self, htmlid=None):
        """
        Show the document in a Jupyter notebook. This allows to assign a specific htmlid so
        the generated HTML can be directly styled afterwards.
        This directly sends the rendered document to the cell (no display/HTML necessary).

        Args:
            htmlid: the HTML id prefix to use for classes and element ids. (Default value = None)

        """
        # display=True: the HTML gets displayed instead of returned
        self._notebook_show(htmlid=htmlid, display=True)

    def _notebook_show(self, htmlid=None, display=False):
        """
        Render the document in the "html-ann-viewer" format for use in a notebook.
        The viewer javascript is initialized first, unless
        gatenlpconfig.notebook_js_initialized is already set.

        Args:
            htmlid: passed on to the serializer as htmlid (Default value = None)
            display: if True, send the HTML to display_html instead of
                returning it (Default value = False)

        Returns:
            the generated HTML if display is False, otherwise None
        """
        from gatenlp.gatenlpconfig import gatenlpconfig
        from gatenlp.serialization.default import HtmlAnnViewerSerializer
        from IPython.display import display_html
        if not gatenlpconfig.notebook_js_initialized:
            HtmlAnnViewerSerializer.init_javscript()
            gatenlpconfig.notebook_js_initialized = True
        rendered = self.save_mem(
            fmt="html-ann-viewer",
            notebook=True,
            add_js=False,
            offline=True,
            htmlid=htmlid,
        )
        if not display:
            return rendered
        display_html(rendered, raw=True)

Subclasses

Static methods

def from_dict(dictrepr, **kwargs)

Return a Document instance as represented by the dictionary dictrepr.

Args

dictrepr
the dictionary representation of the document

**kwargs:

Returns

the initialized Document instance

Expand source code
@staticmethod
def from_dict(dictrepr, **kwargs):
    """Return a Document instance as represented by the dictionary dictrepr.

    Args:
      dictrepr: the dictionary representation of the document, with the keys
        "text", "features", "name", "offset_type" and "annotation_sets"
      **kwargs: not used

    Returns:
      the initialized Document instance

    Raises:
      Exception: if the offset type in dictrepr is neither OFFSET_TYPE_JAVA
        nor OFFSET_TYPE_PYTHON

    """
    feats = dictrepr.get("features")
    doc = Document(dictrepr.get("text"), features=feats)
    doc.name = dictrepr.get("name")
    doc.offset_type = dictrepr.get("offset_type")
    if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
        raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
    # A missing (or None) "annotation_sets" entry is treated as "no
    # annotation sets" instead of failing with an AttributeError on None.
    annsets_repr = dictrepr.get("annotation_sets") or {}
    annsets = {name: AnnotationSet.from_dict(adict, owner_doc=doc)
               for name, adict in annsets_repr.items()}
    doc._annotation_sets = annsets
    return doc
def load(source, fmt=None, mod='gatenlp.serialization.default', **kwargs)

Load or import a document from the given source. The source can be a file path or file name or a URL. If the type of the source is str, then if it starts with "http[s]://" it will get treated as a URL. In order to deliberately use a file instead of a URL, create a pathlib Path; in order to deliberately use a URL instead of a file, parse the URL using urllib.

Example: Document.load(urllib.parse.urlparse(someurl), fmt=theformat)

Example: Document.load(pathlib.Path(somepath), fmt=theformat)

NOTE: the offset type of the document is always converted to PYTHON when loading!

Args

source
the URL or file path to load from.
fmt
the format of the source. By default the format is inferred by the file extension.

The format can be a format mnemonic like "json", "html", or a known mime type like "text/bdocjs".

mod
the name of a module where the document loader is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional format specific keyword arguments to pass to the loader

Returns

the loaded document

Expand source code
@staticmethod
def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
    """Load or import a document from the given source. The source can be a file path or
    file name or a URL. If the type of the source is str, then if it starts with
    "http[s]://" it will get treated as a URL. In order to deliberately use a file instead
    of a URL, create a pathlib Path, in order to deliberately use a URL instead of a file
    parse the URL using urllib.

    Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

    Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

    NOTE: the offset type of the document is always converted to PYTHON when loading!

    Args:
      source: the URL or file path to load from.
      fmt: the format of the source. By default the format is inferred by the file
        extension. The format can be a format mnemonic like "json", "html", or a known
        mime type like "text/bdocjs". Anything that is neither None nor a string is
        used directly as the document loader.
      mod: the name of a module where the document loader is implemented.
        (Default value = "gatenlp.serialization.default")
      **kwargs: additional format specific keyword arguments to pass to the loader

    Returns:
      the loaded document

    """
    if fmt is None or isinstance(fmt, str):
        # look up the loader for the source/format in the given module
        loader = importlib.import_module(mod).get_document_loader(source, fmt)
    else:
        loader = fmt
    loaded = loader(Document, from_ext=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded
def load_mem(source, fmt='json', mod='gatenlp.serialization.default', **kwargs)

Create a document from the in-memory serialization in source. Source can be a string or bytes, depending on the format.

Note: the offset type is always converted to PYTHON when loading!

Args

source
the string/bytes to deserialize
fmt
the format (Default value = "json")
mod
the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
kwargs
additional arguments to pass to the loader
**kwargs
 

Returns:

Expand source code
@staticmethod
def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
    """Create a document from the in-memory serialization in source. Source can be a
    string or bytes, depending on the format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
      source: the string/bytes to deserialize
      fmt: the format, either a format name or a callable that is used directly
        as the document loader (Default value = "json")
      mod: the name of the module where the loader is implemented
        (Default value = "gatenlp.serialization.default")
      **kwargs: additional arguments to pass to the loader

    Returns:
      the document created by the loader

    Raises:
      Exception: if no format is given

    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(None, fmt)
    else:
        loader = fmt
    loaded = loader(Document, from_mem=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded

Instance variables

property/get/set changelog

Get the ChangeLog or None if no ChangeLog has been set.

:return: the changelog

Args:

Returns:

Expand source code
@property
def changelog(self):
    """Get the ChangeLog or None if no ChangeLog has been set.

    Returns:
      the changelog

    """
    return self._changelog
property/get features

Accesses the features as a FeatureViewer instance. Changes made on this object are reflected in the document and recorded in the change log, if there is one.

:return: A FeatureViewer view of the document features.

Args:

Returns:

Expand source code
@property
def features(self):
    """Accesses the features as a FeatureViewer instance. Changes made on this object are
    reflected in the document and recorded in the change log, if there is one.

    Returns:
      A FeatureViewer view of the document features.

    """
    return self._features
property/get/set name
Expand source code
@property
def name(self):
    """Get the name of the document, as stored in the _name attribute."""
    return self._name
property/get/set text : str

Get the text of the document. For a partial document, the text may be None.

:return: the text of the document

Args:

Returns:

Expand source code
@property
def text(self) -> str:
    """Get the text of the document.

    For a partial document, the text may be None.

    Returns:
      the text of the document

    """
    # NOTE(review): presumably makes sure the document uses python offsets
    # before the text is handed out - confirm against _ensure_type_python.
    self._ensure_type_python()
    return self._text

Methods

def annset(self, name: str = '') ‑> AnnotationSet

Get the named annotation set, if name is not given or the empty string, the default annotation set. If the annotation set does not already exist, it is created.

Args

name
the annotation set name, the empty string is used for the "default annotation set".
name
str: (Default value = "")

Returns

the specified annotation set.

Expand source code
def annset(self, name: str = "") -> AnnotationSet:
    """Get the named annotation set, if name is not given or the empty string, the
    default annotation set. If the annotation set does not already exist, it is created.

    Args:
      name: the annotation set name, the empty string is used for the
        "default annotation set". (Default value = "")

    Returns:
      the specified annotation set.

    """
    self._ensure_type_python()
    if name in self._annotation_sets:
        return self._annotation_sets[name]
    annset = AnnotationSet(owner_doc=self, name=name)
    self._annotation_sets[name] = annset
    # Compare against None instead of relying on truthiness: a changelog
    # object that is falsy while it is still empty would otherwise never
    # get its first change recorded.
    if self._changelog is not None:
        self._changelog.append({
            "command": "annotations:add",
            "set": name})
    return annset
def annset_names(self) ‑> KeysView[str]

Args:

Returns

a list with the annotation set names

Expand source code
def annset_names(self) -> list:
    """Return the names of all annotation sets of this document.

    Returns:
      a new list containing the annotation set names

    """
    self._ensure_type_python()
    # A list (not a keys view) is returned, so later changes to the
    # annotation sets are not reflected in the returned object.
    return list(self._annotation_sets.keys())
def apply_changes(self, changes, handle_existing_anns='add-with-new-id')

Apply changes from a ChangeLog to this document. changes can be a ChangeLog instance, a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

The document is modified in-place.

Args

changes
one or more changes
handle_existing_anns
what to do if the change from the changelog tries to add an annotation

with an annotation id that already exists in the target set. (Default value = ADDANN_ADD_WITH_NEW_ID)

Returns:

Expand source code
def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
    """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
    a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

    The document is modified in-place.

    Args:
      changes: one or more changes
      handle_existing_anns: what to do if the change from the changelog tries to add an
        annotation with an annotation id that already exists in the target set.
        (Default value = ADDANN_ADD_WITH_NEW_ID)

    Raises:
      Exception: if a change has no "command" field or an unknown command
      AssertionError: if a change lacks a field its command requires

    """
    if isinstance(changes, dict):
        changes = [changes]
    elif isinstance(changes, ChangeLog):
        changes = changes.changes
    for change in changes:
        cmd = change.get("command")
        fname = change.get("feature")
        fvalue = change.get("value")
        features = change.get("features")
        sname = change.get("set")
        annid = change.get("id")
        if cmd is None:
            raise Exception("Change without field 'command'")
        if cmd == ACTION_ADD_ANNSET:
            assert sname is not None
            self.annset(sname)
        elif cmd == ACTION_ADD_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            start = change.get("start")
            end = change.get("end")
            anntype = change.get("type")

            if ann is None:
                anns.add(start, end, anntype, annid=annid, features=features)
            else:
                if handle_existing_anns == ADDANN_IGNORE:
                    pass
                elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                    # only the id differs from the logged annotation, so the
                    # features from the change have to be kept
                    anns.add(start, end, anntype, features=features)
                elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                    anns.remove(annid)
                    # pass id and features by keyword, exactly as for a new
                    # annotation above, so the id cannot end up in another
                    # positional parameter and the features are not lost
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                    ann.features.update(features)
                elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                    ann.features.clear()
                    ann.features.update(features)
                elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                    fns = ann.feature_names()
                    for f in features.keys():
                        if f not in fns:
                            ann.features[f] = features[f]

        elif cmd == ACTION_CLEAR_ANNS:
            assert sname is not None
            anns = self.annset(sname)
            anns.clear()
        elif cmd == ACTION_CLEAR_ANN_FEATURES:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            if ann is not None:
                ann.features.clear()
            else:
                pass  # ignore, could happen with a detached annotation
        elif cmd == ACTION_CLEAR_DOC_FEATURES:
            self.features.clear()
        elif cmd == ACTION_SET_ANN_FEATURE:
            assert fname is not None
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            ann.features[fname] = fvalue
        elif cmd == ACTION_DEL_ANN_FEATURE:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            if ann is not None:
                if fname is not None:
                    ann.features.pop(fname, None)
            else:
                pass  # ignore, could happen with a detached annotation
        elif cmd == ACTION_DEL_DOC_FEATURE:
            assert fname is not None
            self.features.pop(fname, None)
        elif cmd == ACTION_DEL_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            anns.remove(annid)
        elif cmd == ACTION_SET_DOC_FEATURE:
            assert fname is not None
            self.features[fname] = fvalue
        else:
            # NOTE: the former second pair of branches for clearing/deleting
            # document features could never be reached (both commands are
            # handled above) and has been removed.
            raise Exception("Unknown ChangeLog action: ", cmd)
def copy(self)

Creates a shallow copy except the changelog which is set to None.

:return: shallow copy of the document

Args:

Returns:

Expand source code
def copy(self):
    """Creates a shallow copy except the changelog which is set to None.

    This simply delegates to __copy__.

    Returns:
      shallow copy of the document

    """
    return self.__copy__()
def deepcopy(self, memo=None)

Creates a deep copy, except the changelog which is set to None.

Args

memo
the memoization dictionary to use.

Returns

a deep copy of the document.

Expand source code
def deepcopy(self, memo=None):
    """Creates a deep copy, except the changelog which is set to None.

    Args:
      memo: the memoization dictionary to use. (Default value = None)

    Returns:
      a deep copy of the document.

    """
    # NOTE(review): assumes lib_copy is the standard library copy module,
    # whose deepcopy() dispatches to __deepcopy__ of this class - confirm.
    return lib_copy.deepcopy(self, memo=memo)
def notebook_show(self, htmlid=None)

Show the document in a Jupyter notebook. This allows to assign a specific htmlid so the generated HTML can be directly styled afterwards. This directly sends the rendered document to the cell (no display/HTML necessary).

Args

htmlid
the HTML id prefix to use for classes and element ids.
Expand source code
def notebook_show(self, htmlid=None):
    """
    Show the document in a Jupyter notebook. This allows to assign a specific htmlid so
    the generated HTML can be directly styled afterwards.
    This directly sends the rendered document to the cell (no display/HTML necessary).

    Args:
        htmlid: the HTML id prefix to use for classes and element ids. (Default value = None)

    """
    # display=True: the HTML gets displayed instead of returned
    self._notebook_show(htmlid=htmlid, display=True)
def remove_annset(self, name: str)

Completely remove the annotation set.

Args

name
name of the annotation set to remove
name
str:

Returns:

Expand source code
def remove_annset(self, name: str):
    """Completely remove the annotation set.

    Args:
      name: name of the annotation set to remove

    Raises:
      Exception: if no annotation set with the given name exists

    """
    if name not in self._annotation_sets:
        raise Exception(f"AnnotationSet with name {name} does not exist")
    del self._annotation_sets[name]
    # Compare against None instead of relying on truthiness: a changelog
    # object that is falsy while it is still empty would otherwise never
    # get its first change recorded.
    if self._changelog is not None:
        self._changelog.append({
            "command": "annotations:remove",
            "set": name})
def save(self, destination, fmt=None, offset_type=None, mod='gatenlp.serialization.default', **kwargs)

Save the document to the destination file.

Args

destination
either a file name or something that has a write(string) method.
fmt
serialization format, by default the format is inferred from the file extension.
offset_type
store using the given offset type or keep the current if None (Default value = None)
mod
module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional parameters for the document saver.
**kwargs
 

Returns:

Expand source code
def save(self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", **kwargs):
    """Save the document to the destination file.

    Args:
      destination: either a file name or something that has a write(string) method.
      fmt: serialization format, by default the format is inferred from the file
        extension. Anything that is neither None nor a string is assumed to be a
        callable and used directly as the document saver.
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the document saver.

    Returns:
      None

    """
    if fmt is None or isinstance(fmt, str):
        # look up the saver for the destination/format in the given module
        saver = importlib.import_module(mod).get_document_saver(destination, fmt)
    else:
        saver = fmt
    saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
def save_mem(self, fmt='json', offset_type=None, mod='gatenlp.serialization.default', **kwargs)

Serialize to a string or bytes in the given format.

Args

fmt
serialization format to use. (Default value = "json")
offset_type
store using the given offset type or keep the current if None (Default value = None)
mod
module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional parameters for the format.
**kwargs
 

Returns:

Expand source code
def save_mem(self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs):
    """Serialize to a string or bytes in the given format.

    Args:
      fmt: serialization format to use, either a format name or a callable that
        is used directly as the document saver. (Default value = "json")
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the format.

    Returns:
      whatever the document saver returns for the in-memory serialization

    Raises:
      Exception: if no format is given

    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        m = importlib.import_module(mod)
        saver = m.get_document_saver(None, fmt)
    else:
        # assume fmt is a callable to get used directly
        saver = fmt
    # The result has to be returned for both kinds of savers: previously the
    # serialization created by a callable fmt was silently dropped.
    return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
def to_dict(self, offset_type=None, **kwargs)

Convert this instance to a dictionary that can be used to re-create the instance with from_dict. NOTE: if there is an active changelog, it is not included in the output as this field is considered a transient field!

Args

offset_type
convert to the given offset type on the fly (Default value = None)
**kwargs
 

Returns

the dictionary representation of this instance

Expand source code
def to_dict(self, offset_type=None, **kwargs):
    """Build a plain dictionary from this document which from_dict can turn back
    into an equivalent instance.
    NOTE: an active changelog is considered transient and is therefore never
    part of the returned dictionary!

    Args:
      offset_type: offset type to convert to on the fly, or None to keep the
        offset type the document currently has (Default value = None)
      **kwargs: passed on to the to_dict method of every annotation set

    Returns:
      the dictionary representation of this instance

    """
    if offset_type is None:
        # no conversion requested: report the offset type the document already uses
        offset_type = self.offset_type
    else:
        assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON
        # A mapper is only needed when the requested type differs from the current
        # one and there is text to map over; it is handed down to the annotation
        # sets, where the actual offset conversion takes place.
        if offset_type != self.offset_type and self._text is not None:
            kwargs["offset_mapper"] = OffsetMapper(self._text)
            kwargs["offset_type"] = offset_type

    annsets_as_dicts = {}
    for set_name, annset in self._annotation_sets.items():
        annsets_as_dicts[set_name] = annset.to_dict(**kwargs)

    return {
        "annotation_sets": annsets_as_dicts,
        "text": self._text,
        "features": self._features.to_dict(),
        "offset_type": offset_type,
        "name": self.name,
    }
def to_offset_type(self, offsettype: str) ‑> OffsetMapper

Convert all the offsets of all the annotations in this document to the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets are already of that type, this does nothing.

NOTE: if the document has a ChangeLog, it is NOT also converted!

The method returns the offset mapper if anything actually was converted, otherwise None.

Args

offsettype (str)
either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

Returns

offset mapper or None

Expand source code
def to_offset_type(self, offsettype: str) -> OffsetMapper:
    """Switch the offsets of every annotation in this document to the requested
    offset type, which must be OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. Nothing
    happens if the document already uses that offset type.

    NOTE: a ChangeLog attached to the document is NOT converted!

    Args:
      offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

    Returns:
      the offset mapper used for the conversion, or None if no conversion
      was necessary

    Raises:
      Exception: if the requested/current offset type combination is neither
        python-to-java nor java-to-python

    """
    current = self.offset_type
    if offsettype == current:
        # already in the requested representation, nothing to convert
        return None

    to_java = offsettype == OFFSET_TYPE_JAVA and current == OFFSET_TYPE_PYTHON
    to_python = (
        not to_java
        and offsettype == OFFSET_TYPE_PYTHON
        and current == OFFSET_TYPE_JAVA
    )
    if not (to_java or to_python):
        raise Exception("Odd offset type")

    # the mapper is built from the document text and supplies both conversion directions
    mapper = OffsetMapper(self._text)
    if to_java:
        self._fixup_annotations(mapper.convert_to_java)
        self.offset_type = OFFSET_TYPE_JAVA
    else:
        self._fixup_annotations(mapper.convert_to_python)
        self.offset_type = OFFSET_TYPE_PYTHON
    return mapper