Package gatenlp
Expand source code
# NOTE: do not place a comment at the end of the version assignment
# line since we parse that line in a shell script!
__version__ = "0.9.8"
import logging
import sys
# sortedcontainers is a hard requirement of the package: if it cannot be
# imported, report how to install it on stderr and terminate with exit
# status 1 instead of showing a bare traceback.
try:
    import sortedcontainers
except ImportError:
    # only a failed import is handled here; `sys` is already imported above
    print(
        "ERROR: required package sortedcontainers cannot be imported!", file=sys.stderr
    )
    print(
        "Please install it, using e.g. 'pip install -U sortedcontainers'",
        file=sys.stderr,
    )
    sys.exit(1)
# TODO: check version of sortedcontainers (we have 2.1.0)
from gatenlp.utils import init_logger
logger = init_logger("gatenlp")
# this attribute globally holds the processing resource last defined
# so it can be used for interacting with the GATE python plugin
from gatenlp.gate_interaction import _pr_decorator as GateNlpPr
from gatenlp.gate_interaction import interact
from gatenlp.annotation import Annotation
from gatenlp.document import Document
from gatenlp.annotation_set import AnnotationSet
from gatenlp.changelog import ChangeLog
from gatenlp.gateslave import GateSlave
from gatenlp.span import Span
def init_notebook():
    """
    Run the Javascript initialization of the HTML annotation viewer serializer.

    Calls `HtmlAnnViewerSerializer.init_javscript()` (the method name is spelled
    that way in the serializer API) and afterwards sets
    `gatenlpconfig.notebook_js_initialized` to True.
    """
    # both modules are imported only when the function is actually called
    from gatenlp.serialization.default import HtmlAnnViewerSerializer as viewer
    from gatenlp.gatenlpconfig import gatenlpconfig as config

    viewer.init_javscript()
    config.notebook_js_initialized = True
# Names exported by `from gatenlp import *`.
__all__ = [
"GateNlpPr",
"Annotation",
"Document",
"AnnotationSet",
"ChangeLog",
"logger",
]
# Module-level slot for the processing resource last defined: the decorator
# exported as GateNlpPr assigns its wrapper object to this attribute.
# None until a processing resource has been defined.
gate_python_plugin_pr = None
Sub-modules
gatenlp.annotation
-
Module for Annotation class which represents information about a span of text in a document.
gatenlp.annotation_set
-
Module for AnnotationSet class which represents a named collection of annotations which can arbitrarily overlap.
gatenlp.changelog
-
Module for ChangeLog class which represents a log of changes to any of the components of a Document: document features, annotations, annotation features.
gatenlp.corpora
-
Module that defines base and implementation classes for representing document collections …
gatenlp.document
-
Module that implements the Document class for representing gatenlp documents with features and annotation sets.
gatenlp.features
-
Module that implements class Feature for representing features.
gatenlp.gate_interaction
-
Support for interacting between a GATE (java) process and a gatenlp (Python) process. This is used by the Java GATE Python plugin.
gatenlp.gatenlpconfig
-
Module that provides the class GatenlpConfig and the instance gatenlpconfig which stores various global configuration options.
gatenlp.gateslave
-
Module for interacting with a Java GATE process, running API commands on it and exchanging data with it.
gatenlp.impl
-
This subpackage contains modules for (temporary) implementation of datastructures and algorithms needed. Some of these may get replaced by other …
gatenlp.lang
gatenlp.lib_spacy
-
Support for using spacy: convert from spacy to gatenlp documents and annotations.
gatenlp.lib_stanfordnlp
-
Support for using stanfordnlp: convert from stanfordnlp output to gatenlp documents and annotations.
gatenlp.lib_stanza
-
Support for using stanford stanza (see https://stanfordnlp.github.io/stanza/): convert from stanford Stanza output to gatenlp documents and annotations.
gatenlp.offsetmapper
-
Module that implements the OffsetMapper class for mapping between Java-style and Python-style string offsets. Java strings are represented as UTF16 …
gatenlp.pam
gatenlp.processing
gatenlp.serialization
gatenlp.span
-
Module for Span class
gatenlp.utils
-
Various utilities that could be useful in several modules.
Functions
def GateNlpPr(what)
-
This is the decorator to identify a class or function as a processing resource. This is made available with the name GateNlpPr in the gatenlp package.
This creates an instance of PRWrapper and registers all the relevant functions of the decorated class or the decorated function in the wrapper.
Args
what
- the class or function to decorate.
Returns
modified class or function
Expand source code
def _pr_decorator(what):
    """
    Decorator to identify a class, a callable instance or a function as a
    processing resource.

    Creates a `_PrWrapper`, registers the relevant callables of the decorated
    object in it (`__call__` as the execute function, plus the optional
    `start`, `finish` and `reduce` methods) and stores the wrapper in
    `gatenlp.gate_python_plugin_pr`.

    Args:
        what: the class or function to decorate.

    Returns:
        the `_PrWrapper` instance the decorated object was registered in

    Raises:
        Exception: if `what` has no usable `__call__` method, or is neither
            a class, a callable instance nor a function.
    """
    # placeholder value: it stays in place if the registration below raises
    gatenlp.gate_python_plugin_pr = "The PR from here!!!"
    wrapper = _PrWrapper()
    if inspect.isclass(what) or _has_method(what, "__call__"):
        # if it is a class, create an instance, otherwise assume it is
        # already an instance
        instance = what() if inspect.isclass(what) else what
        # TODO: instead of this we could just as well store the instance and
        # directly call the instance methods from the wrapper!
        execmethod = _has_method(instance, "__call__")
        if not execmethod:
            raise Exception("PR does not have a __call__(doc) method.")
        wrapper.func_execute_allowkws = _check_exec(execmethod)
        wrapper.func_execute = execmethod
        # the optional lifecycle methods are all registered the same way:
        # wrapper.func_<name> gets the method and wrapper.func_<name>_allowkws
        # is switched on when the method accepts **kwargs
        for hook in ("start", "finish", "reduce"):
            method = _has_method(instance, hook)
            if not method:
                continue
            setattr(wrapper, f"func_{hook}", method)
            if inspect.getfullargspec(method).varkw:
                setattr(wrapper, f"func_{hook}_allowkws", True)
    elif inspect.isfunction(what):
        accepts_kwargs = _check_exec(what)
        wrapper.func_execute = what
        wrapper.func_execute_allowkws = accepts_kwargs
    else:
        raise Exception(
            f"Decorator applied to something that is not a function or class: {what}"
        )
    gatenlp.gate_python_plugin_pr = wrapper
    return wrapper
Classes
class Annotation (start: int, end: int, anntype: str, features=None, annid: int = 0)
-
An annotation represents information about a span of text. It contains the start and end offsets of the span, an "annotation type" and an arbitrary number of features.
In addition it contains an id which has no meaning for the annotation itself but is used to uniquely identify an annotation within the set it is contained in.
All fields except the features are immutable, once the annotation has been created only the features can be changed.
This constructor creates a new annotation instance. Once an annotation has been created, the start, end, type and id fields cannot be changed.
NOTE: this should almost never be done directly and instead the method AnnotationSet.add should be used.
Args
start
- start offset of the annotation
end
- end offset of the annotation
anntype
- annotation type
features
- an initial collection of features, None for no features.
annid
- the id of the annotation
Expand source code
class Annotation:
    """
    An annotation represents information about a span of text. It contains the
    start and end offsets of the span, an "annotation type" and an arbitrary
    number of features.

    In addition it contains an id which has no meaning for the annotation
    itself but is used to uniquely identify an annotation within the set it is
    contained in.

    All fields except the features are immutable, once the annotation has been
    created only the features can be changed.
    """

    @allowspan
    def __init__(
        self, start: int, end: int, anntype: str, features=None, annid: int = 0
    ):
        """
        This constructor creates a new annotation instance. Once an annotation
        has been created, the start, end, type and id fields cannot be changed.

        NOTE: this should almost never be done directly and instead the method
        AnnotationSet.add should be used.

        Args:
            start: start offset of the annotation
            end: end offset of the annotation
            anntype: annotation type
            features: an initial collection of features, None for no features.
            annid: the id of the annotation

        Raises:
            Exception: if `end` is smaller than `start`, if `annid` is not an
                int, or if `features` is an int.
        """
        if end < start:
            raise Exception(
                f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: start > end"
            )
        # The two type checks below catch calls where annid and features got
        # mixed up.
        if not isinstance(annid, int):
            raise Exception(
                f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: annid is not an int"
            )
        if isinstance(features, int):
            raise Exception(
                f"Cannot create annotation start={start}, end={end}, type={anntype}, id={annid}, features={features}: features must not be an int"
            )
        self._owner_set = None
        self._features = Features(features, logger=self._log_feature_change)
        self._type = anntype
        self._start = start
        self._end = end
        self._id = annid

    @property
    def type(self) -> str:
        """
        Returns the annotation type.
        """
        return self._type

    @property
    def start(self) -> int:
        """
        Returns the start offset.
        """
        return self._start

    @property
    def end(self):
        """
        Returns the end offset.
        """
        return self._end

    @property
    def features(self):
        """
        Returns the features for the annotation.
        """
        return self._features

    @property
    def id(self):
        """
        Returns the annotation id.
        """
        return self._id

    @property
    def span(self) -> Span:
        """
        Returns a Span with the start and end offset of the annotation.
        """
        return Span(self._start, self._end)

    def _changelog(self):
        """
        Returns the changelog of the owning set, or None if the annotation has
        no owning set.
        """
        if self._owner_set is not None:
            return self._owner_set.changelog
        return None

    # TODO: for now at least, make sure only simple JSON serialisable things are used! We do NOT
    # allow any user specific types in order to make sure what we create is interchangeable with GATE.
    # In addition we do NOT allow None features.
    # So a feature name always has to be a string (not None), the value has to be anything that is json
    # serialisable (except None keys for maps).
    # For performance reasons we check the feature name but not the value (maybe make checking optional
    # on by default but still optional?)
    def _log_feature_change(
        self, command: str, feature: str = None, value=None
    ) -> None:
        """
        Append a record about a feature change to the changelog of the owning
        set. Does nothing if there is no changelog.

        Args:
            command: the command name, recorded with the prefix "ann-"
            feature: the feature name, only recorded if not None (Default value = None)
            value: the feature value, only recorded if not None (Default value = None)
        """
        changelog = self._changelog()
        if changelog is None:
            return
        ch = {
            "command": "ann-" + command,
            "type": "annotation",
            "set": self._owner_set.name,
            "id": self.id,
        }
        if feature is not None:
            ch["feature"] = feature
        if value is not None:
            ch["value"] = value
        changelog.append(ch)

    def __eq__(self, other) -> bool:
        """
        Two annotations are identical if they are the same object or if all
        the fields are equal (including the annotation id)!
        """
        if not isinstance(other, Annotation):
            return False
        if self is other:
            return True
        return (
            self.start == other.start
            and self.end == other.end
            and self.type == other.type
            and self.id == other.id
            and self._features == other._features
        )

    def __hash__(self):
        """
        The hash depends on the annotation ID and the owning set.
        """
        # NOTE(review): __eq__ does not look at the owning set, so annotations
        # that compare equal but belong to different sets can hash differently
        # - confirm this is intended.
        return hash((self.id, self._owner_set))

    def __lt__(self, other) -> bool:
        """
        Comparison for sorting: this sorts by increasing start offset, then
        increasing annotation id. Since annotation ids within a set are unique,
        this guarantees a unique order of annotations that come from an
        annotation set.

        Note: for now the other object has to be an instance of Annotation,
        duck typing is not supported!

        Raises:
            Exception: if `other` is not an Annotation
        """
        if not isinstance(other, Annotation):
            raise Exception("Cannot compare to non-Annotation")
        if self.start < other.start:
            return True
        elif self.start > other.start:
            return False
        else:
            return self.id < other.id

    def __repr__(self) -> str:
        """
        String representation of the annotation.
        """
        return "Annotation({},{},{},features={},id={})".format(
            self.start, self.end, self.type, self._features, self.id
        )

    @property
    def length(self) -> int:
        """
        Returns the length of the annotation: this is the length of the offset
        span. Since the end offset is one after the last element, we return
        end-start.

        Note: this is deliberately not implemented as len(ann), as len(annset)
        returns the number of annotations in the set but annset.length() also
        returns the span length of the annotation set, so the method name for
        this is identical between annotations and annotation sets.
        """
        return self.end - self.start

    @support_annotation_or_set
    def isoverlapping(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is overlapping with the given span,
        annotation or annotation set, i.e. if the first or last character of
        the given span is inside this annotation or the first character of
        this annotation is inside the given span.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span

        Returns:
            `True` if overlapping, `False` otherwise
        """
        return (
            self.iscovering(start)
            or self.iscovering(end - 1)
            # the two checks above miss a span that starts before and ends
            # after this annotation, so also test our own first offset
            or start <= self.start < end
        )

    @support_annotation_or_set
    def iscoextensive(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is coextensive with the given span,
        annotation or annotation set, i.e. has exactly the same start and end
        offsets.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span

        Returns:
            `True` if coextensive, `False` otherwise
        """
        return self.start == start and self.end == end

    @support_annotation_or_set
    def iswithin(self, start: int, end: int) -> bool:
        """
        Checks if this annotation is within the given span, annotation or
        annotation set, i.e. the start of this annotation is not before the
        given start and the end of this annotation is not after the given end.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span

        Returns:
            `True` if within, `False` otherwise
        """
        return start <= self.start and end >= self.end

    @support_annotation_or_set
    def isbefore(self, start: int, end: int, immediately=False) -> bool:
        """
        Checks if this annotation is before the other span, i.e. the end of
        this annotation is not after the start of the other annotation or
        span.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span
            immediately: if true checks if this annotation ends immediately
                before the other one (Default value = False)

        Returns:
            True if before, False otherwise
        """
        if immediately:
            return self.end == start
        else:
            return self.end <= start

    @support_annotation_or_set
    def isafter(self, start: int, end: int, immediately=False) -> bool:
        """
        Checks if this annotation is after the other span, i.e. the start of
        this annotation is not before the end of the other annotation or span.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span
            immediately: if true checks if this annotation starts immediately
                after the other one (Default value = False)

        Returns:
            True if after, False otherwise
        """
        if immediately:
            return self.start == end
        else:
            return self.start >= end

    @support_annotation_or_set
    def gap(self, start: int, end: int):
        """
        Return the gap between this annotation and the other annotation or
        span: the start offset of whichever of the two starts later minus the
        end offset of the other one.

        This is negative if the annotations overlap.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of span
            end: end offset of span

        Returns:
            size of gap
        """
        if self.start < start:
            # this annotation starts first
            return start - self.end
        # the other span starts first, or both start at the same offset
        return self.start - end

    @support_annotation_or_set
    def iscovering(self, start: int, end: int = None) -> bool:
        """
        Checks if this annotation is covering the given span, annotation or
        annotation set, i.e. the given start is not before the start of this
        annotation and the given end is not after the end of this annotation.

        If end is not given, then the method checks if start is an offset of a
        character contained in the span.

        Note: this can be called with an Annotation or AnnotationSet instead
        of `start` and `end` (see gatenlp._utils.support_annotation_or_set)

        Args:
            start: start offset of the span
            end: end offset of the span

        Returns:
            True if covering, False otherwise
        """
        if end is None:
            return self.start <= start < self.end
        else:
            return self.start <= start and self.end >= end

    def to_dict(self, offset_mapper=None, offset_type=None, **kwargs):
        """
        Return a representation of this annotation as a nested map. This
        representation is used for several serialization methods.

        Args:
            offset_mapper: the offset mapper to use, must be specified if
                `offset_type` is specified.
            offset_type: the offset type to be used for the conversion, must
                be specified if `offset_mapper` is specified
            kwargs: ignored

        Returns:
            the dictionary representation of the Annotation

        Raises:
            Exception: if only one of `offset_mapper` and `offset_type` is
                specified or if `offset_type` is not a valid offset type
        """
        if (offset_mapper and not offset_type) or (not offset_mapper and offset_type):
            raise Exception(
                "offset_mapper and offset_type must be specified both or none"
            )
        if offset_mapper is not None:
            if offset_type == OFFSET_TYPE_JAVA:
                start = offset_mapper.convert_to_java(self._start)
                end = offset_mapper.convert_to_java(self._end)
            elif offset_type == OFFSET_TYPE_PYTHON:
                start = offset_mapper.convert_to_python(self._start)
                end = offset_mapper.convert_to_python(self._end)
            else:
                raise Exception(
                    f"Not a valid offset type: {offset_type}, must be 'p' or 'j'"
                )
        else:
            start = self._start
            end = self._end
        return {
            "type": self.type,
            "start": start,
            "end": end,
            "id": self.id,
            "features": self._features.to_dict(),
        }

    @staticmethod
    def from_dict(dictrepr, owner_set=None, **kwargs):
        """
        Construct an annotation object from the dictionary representation.

        Args:
            dictrepr: dictionary representation
            owner_set: the owning set the annotation should have (Default value = None)
            kwargs: ignored

        Returns:
            the new Annotation
        """
        ann = Annotation(
            dictrepr.get("start"),
            dictrepr.get("end"),
            dictrepr.get("type"),
            annid=dictrepr.get("id"),
            features=dictrepr.get("features"),
        )
        ann._owner_set = owner_set
        return ann

    def __copy__(self):
        # the new annotation has no owning set: the constructor resets it
        return Annotation(
            self._start, self._end, self._type, annid=self._id, features=self._features
        )

    def copy(self):
        """
        Return a shallow copy of the annotation (features are shared).
        """
        return self.__copy__()

    def __deepcopy__(self, memo=None):
        if self._features is not None:
            fts = lib_copy.deepcopy(self._features.to_dict(), memo=memo)
        else:
            fts = None
        return Annotation(
            self._start, self._end, self._type, annid=self._id, features=fts
        )

    def deepcopy(self, memo=None):
        """
        Return a deep copy of the annotation (features and their values are
        copied as well).
        """
        return lib_copy.deepcopy(self, memo=memo)
Static methods
def from_dict(dictrepr, owner_set=None, **kwargs)
-
Construct an annotation object from the dictionary representation.
Args
dictrepr
- dictionary representation
owner_set
- the owning set the annotation should have (Default value = None)
kwargs
- ignored
Expand source code
@staticmethod
def from_dict(dictrepr, owner_set=None, **kwargs):
    """
    Build an Annotation from its dictionary representation.

    Args:
        dictrepr: dictionary representation, read through the keys
            "start", "end", "type", "id" and "features"
        owner_set: the owning set the annotation should have (Default value = None)
        kwargs: ignored

    Returns:
        the new Annotation with its owning set set to `owner_set`
    """
    keys = ("start", "end", "type", "id", "features")
    start, end, anntype, annid, features = [dictrepr.get(key) for key in keys]
    annotation = Annotation(start, end, anntype, annid=annid, features=features)
    annotation._owner_set = owner_set
    return annotation
Instance variables
property/get end
-
Returns the end offset.
Expand source code
@property
def end(self):
    """
    The end offset of the annotation (read-only).
    """
    end_offset = self._end
    return end_offset
property/get features
-
Returns the features for the annotation.
Expand source code
@property
def features(self):
    """
    The features object of the annotation (read-only property; the returned
    object is the one stored in the annotation, not a copy).
    """
    feature_store = self._features
    return feature_store
property/get id
-
Returns the annotation id.
Expand source code
@property
def id(self):
    """
    The id of the annotation (read-only).
    """
    annotation_id = self._id
    return annotation_id
property/get length : int
-
Returns the length of the annotation: this is the length of the offset span. Since the end offset is one after the last element, we return end-start. Note: this is deliberately not implemented as len(ann), as len(annset) returns the number of annotations in the set but annset.length() also returns the span length of the annotation set, so the method name for this is identical between annotations and annotation sets.
Expand source code
@property
def length(self) -> int:
    """
    The length of the offset span of the annotation, i.e. end - start (the end
    offset is one after the last element).

    Note: this is deliberately not implemented as len(ann), as len(annset)
    returns the number of annotations in the set but annset.length() also
    returns the span length of the annotation set, so the method name for this
    is identical between annotations and annotation sets.
    """
    last = self.end
    first = self.start
    return last - first
property/get span : Span
-
Returns a Span with the start and end offset of the annotation.
Expand source code
@property
def span(self) -> Span:
    """
    A Span object created from the start and end offset of the annotation.
    """
    first, last = self._start, self._end
    return Span(first, last)
property/get start : int
-
Returns the start offset.
Expand source code
@property
def start(self) -> int:
    """
    The start offset of the annotation (read-only).
    """
    start_offset = self._start
    return start_offset
property/get type : str
-
Returns the annotation type.
Expand source code
@property
def type(self) -> str:
    """
    The annotation type (read-only).
    """
    annotation_type = self._type
    return annotation_type
Methods
def copy(self)
-
Return a shallow copy of the annotation (features are shared).
Expand source code
def copy(self):
    """
    Return a shallow copy of the annotation (features are shared).

    This simply delegates to `__copy__`.
    """
    duplicate = self.__copy__()
    return duplicate
def deepcopy(self, memo=None)
-
Return a deep copy of the annotation (features and their values are copied as well).
Expand source code
def deepcopy(self, memo=None):
    """
    Return a deep copy of the annotation (features and their values are
    copied as well).

    Args:
        memo: passed on as the `memo` argument of `lib_copy.deepcopy`
            (Default value = None)
    """
    duplicate = lib_copy.deepcopy(self, memo=memo)
    return duplicate
def gap(self, start: int, end: int)
-
Return the gap between this annotation and the other annotation. This is the distance between the last character of the first annotation and the first character of the second annotation in sequence, so it is always independent of the order of the two annotations.
This is negative if the annotations overlap.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of span
end
- end offset of span
Returns
size of gap
Expand source code
@support_annotation_or_set
def gap(self, start: int, end: int):
    """
    Return the gap between this annotation and the other annotation or span:
    the start offset of whichever of the two starts later minus the end offset
    of the other one.

    This is negative if the annotations overlap.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of span
        end: end offset of span

    Returns:
        size of gap
    """
    if self.start < start:
        # this annotation starts first
        return start - self.end
    # the other span starts first, or both start at the same offset
    return self.start - end
def isafter(self, start: int, end: int, immediately=False) ‑> bool
-
Checks if this annotation is after the other span, i.e. the start of this annotation is after the end of the other annotation or span.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
immediately
- if true checks if this annotation starts immediately after the other one (Default value = False)
Returns
True if after, False otherwise
Expand source code
@support_annotation_or_set
def isafter(self, start: int, end: int, immediately=False) -> bool:
    """
    Checks if this annotation is after the other span, i.e. the start of this
    annotation is not before the end of the other annotation or span.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span
        immediately: if true checks if this annotation starts immediately
            after the other one (Default value = False)

    Returns:
        True if after, False otherwise
    """
    if not immediately:
        return self.start >= end
    # "immediately after": must start exactly where the other span ends
    return self.start == end
def isbefore(self, start: int, end: int, immediately=False) ‑> bool
-
Checks if this annotation is before the other span, i.e. the end of this annotation is before the start of the other annotation or span.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
immediately
- if true checks if this annotation ends immediately before the other one (Default value = False)
Returns
True if before, False otherwise
Expand source code
@support_annotation_or_set
def isbefore(self, start: int, end: int, immediately=False) -> bool:
    """
    Checks if this annotation is before the other span, i.e. the end of this
    annotation is not after the start of the other annotation or span.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span
        immediately: if true checks if this annotation ends immediately
            before the other one (Default value = False)

    Returns:
        True if before, False otherwise
    """
    if not immediately:
        return self.end <= start
    # "immediately before": must end exactly where the other span starts
    return self.end == start
def iscoextensive(self, start: int, end: int) ‑> bool
-
Checks if this annotation is coextensive with the given span, annotation or annotation set, i.e. has exactly the same start and end offsets.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
Returns
`True` if coextensive, `False` otherwise
Expand source code
@support_annotation_or_set
def iscoextensive(self, start: int, end: int) -> bool:
    """
    Checks if this annotation is coextensive with the given span, annotation
    or annotation set, i.e. has exactly the same start and end offsets.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span

    Returns:
        `True` if coextensive, `False` otherwise
    """
    same_start = self.start == start
    return same_start and self.end == end
def iscovering(self, start: int, end: int = None) ‑> bool
-
Checks if this annotation is covering the given span, annotation or annotation set, i.e. both the given start and end offsets are after the start of this annotation and before the end of this annotation.
If end is not given, then the method checks if start is an offset of a character contained in the span.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
Returns
True if covering, False otherwise
Expand source code
@support_annotation_or_set
def iscovering(self, start: int, end: int = None) -> bool:
    """
    Checks if this annotation is covering the given span, annotation or
    annotation set, i.e. the given start is not before the start of this
    annotation and the given end is not after the end of this annotation.

    If end is not given, then the method checks if start is an offset of a
    character contained in the span.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span

    Returns:
        True if covering, False otherwise
    """
    if end is not None:
        return self.start <= start and self.end >= end
    # only a single offset was given: it must be inside [start, end)
    return self.start <= start < self.end
def isoverlapping(self, start: int, end: int) ‑> bool
-
Checks if this annotation is overlapping with the given span, annotation or annotation set. An annotation is overlapping with a span if the first or last character is inside that span.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
Returns
`True` if overlapping, `False` otherwise
Expand source code
@support_annotation_or_set
def isoverlapping(self, start: int, end: int) -> bool:
    """
    Checks if this annotation is overlapping with the given span, annotation
    or annotation set, i.e. if the first or last character of the given span
    is inside this annotation or the first character of this annotation is
    inside the given span.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span

    Returns:
        `True` if overlapping, `False` otherwise
    """
    return (
        self.iscovering(start)
        or self.iscovering(end - 1)
        # the two checks above miss a span that starts before and ends after
        # this annotation, so also test our own first offset
        or start <= self.start < end
    )
def iswithin(self, start: int, end: int) ‑> bool
-
Checks if this annotation is within the given span, annotation or annotation set, i.e. both the start and end offsets of this annotation are after the given start and before the given end.
Note: this can be called with an Annotation or AnnotationSet instead of `start` and `end` (see gatenlp._utils.support_annotation_or_set)
Args
start
- start offset of the span
end
- end offset of the span
Returns
`True` if within, `False` otherwise
Expand source code
@support_annotation_or_set
def iswithin(self, start: int, end: int) -> bool:
    """
    Checks if this annotation is within the given span, annotation or
    annotation set, i.e. the start of this annotation is not before the given
    start and the end of this annotation is not after the given end.

    Note: this can be called with an Annotation or AnnotationSet instead of
    `start` and `end` (see gatenlp._utils.support_annotation_or_set)

    Args:
        start: start offset of the span
        end: end offset of the span

    Returns:
        `True` if within, `False` otherwise
    """
    starts_inside = start <= self.start
    return starts_inside and end >= self.end
def to_dict(self, offset_mapper=None, offset_type=None, **kwargs)
-
Return a representation of this annotation as a nested map. This representation is used for several serialization methods.
Args
offset_mapper
- the offset mapper to use, must be specified if `offset_type` is specified.
offset_type
- the offset type to be used for the conversion, must be specified if `offset_mapper` is specified
Returns
the dictionary representation of the Annotation
Expand source code
def to_dict(self, offset_mapper=None, offset_type=None, **kwargs):
    """
    Return a representation of this annotation as a nested map. This
    representation is used for several serialization methods.

    Args:
        offset_mapper: the offset mapper to use, must be specified if
            `offset_type` is specified.
        offset_type: the offset type to be used for the conversion, must be
            specified if `offset_mapper` is specified
        kwargs: ignored

    Returns:
        the dictionary representation of the Annotation

    Raises:
        Exception: if only one of `offset_mapper` and `offset_type` is
            specified or if `offset_type` is not a valid offset type
    """
    # exactly one of the two being set (truthy) is an error
    if bool(offset_mapper) != bool(offset_type):
        raise Exception(
            "offset_mapper and offset_type must be specified both or none"
        )
    first, last = self._start, self._end
    if offset_mapper is not None:
        # pick the conversion once, then apply it to both offsets
        if offset_type == OFFSET_TYPE_JAVA:
            convert = offset_mapper.convert_to_java
        elif offset_type == OFFSET_TYPE_PYTHON:
            convert = offset_mapper.convert_to_python
        else:
            raise Exception(
                f"Not a valid offset type: {offset_type}, must be 'p' or 'j'"
            )
        first = convert(first)
        last = convert(last)
    representation = {
        "type": self.type,
        "start": first,
        "end": last,
        "id": self.id,
        "features": self._features.to_dict(),
    }
    return representation
class AnnotationSet (name: str = '', owner_doc: Document = None)
-
Creates an annotation set. This should not be used directly by the user, instead the method
Document.annset(name)
should be used to access the annotation set with a given name from the document. An annotation set contains an arbitrary number of annotations, which can overlap in arbitrary ways. Each annotation set has a name and a document can have as many named annotation sets as needed.
Args
name
- the name of the annotation set, default: the empty string (default annotation set)
owner_doc
- if this is set, the set and all sets created from it can be queried for the owning document and offsets get checked against the text of the owning document, if it has text. Also, the changelog is only updated if an annotation set has an owning document.
Expand source code
class AnnotationSet:
    def __init__(self, name: str = "", owner_doc: "Document" = None):
        """
        Creates an annotation set. This should not be used directly by the
        user, instead the method `Document.annset(name)` should be used to
        access the annotation set with a given name from the document.

        An annotation set contains an arbitrary number of annotations, which
        can overlap in arbitrary ways. Each annotation set has a name and a
        document can have as many named annotation sets as needed.

        Args:
            name: the name of the annotation set, default: the empty string
                (default annotation set)
            owner_doc: if this is set, the set and all sets created from it
                can be queried for the owning document and offsets get checked
                against the text of the owning document, if it has text.
                Also, the changelog is only updated if an annotation set has
                an owning document.
        """
        self._name = name
        self._owner_doc = owner_doc
        # Both indices are built lazily, on first use.
        self._index_by_offset = None
        self._index_by_type = None
        # internally we represent the annotations as a map from annotation id (int) to Annotation
        self._annotations = {}
        self._is_immutable = False
        self._next_annid = 0

    @property
    def name(self):
        """
        Returns the name of the annotation set.

        Note: the name of a set cannot be changed.
        """
        return self._name

    @property
    def changelog(self):
        """
        Returns the changelog of the owning document, or None if the set has
        no owning document.
        """
        if self._owner_doc is None:
            return None
        return self._owner_doc.changelog

    def __setattr__(self, key, value):
        """
        Prevent immutable fields from getting overridden, once they have been set.

        Raises:
            Exception: if `name` or `owner_doc` already holds a non-None value.
        """
        # NOTE(review): the guard only covers the keys "name" and "owner_doc";
        # the values stored by __init__ live under "_name" and "_owner_doc",
        # which pass through the unguarded branch below -- confirm intent.
        if key == "name" or key == "owner_doc":
            if self.__dict__.get(key, None) is None:
                super().__setattr__(key, value)
            else:
                raise Exception(
                    "AnnotationSet attribute cannot get changed after being set"
                )
        else:
            super().__setattr__(key, value)

    def detach(self, restrict_to=None) -> "AnnotationSet":
        """
        Creates an immutable and detached copy of this set, optionally
        restricted to the given annotation ids. A detached annotation set does
        not have an owning document and deleting or adding annotations does
        not change the annotations stored with the document. However, the
        annotations in a detached annotation set are the same objects as those
        stored in the attached set, so updating their features will modify the
        annotations in the document as well.

        Args:
            restrict_to: an iterable of annotation ids, if None, all the
                annotations from this set.

        Returns:
            an immutable annotation set
        """
        annset = AnnotationSet(name="detached-from:" + self.name)
        annset._is_immutable = True
        if restrict_to is None:
            annset._annotations = dict(self._annotations)
        else:
            annset._annotations = {
                annid: self._annotations[annid] for annid in restrict_to
            }
        annset._next_annid = self._next_annid
        return annset

    def detach_from(self, anns: Iterable) -> "AnnotationSet":
        """
        Creates an immutable detached annotation set from the annotations in
        anns which could by either a collection of annotations or annotation
        ids (int numbers) which are assumed to be the annotation ids from this
        set.

        The next annotation id for the created set is the highest seen
        annotation id from anns plus one.

        Args:
            anns: an iterable of annotations or annotation ids

        Returns:
            an immutable detached annotation set
        """
        annset = AnnotationSet(name="detached-from:" + self.name)
        annset._is_immutable = True
        annset._annotations = {}
        nextid = -1
        for ann in anns:
            if isinstance(ann, int):
                annid = ann
                annset._annotations[annid] = self._annotations[annid]
            else:
                # Key by the annotation's own id (not the builtin `id`).
                annid = ann.id
                annset._annotations[annid] = ann
            if annid > nextid:
                nextid = annid
        annset._next_annid = nextid + 1
        return annset

    @property
    def immutable(self) -> bool:
        """
        Get or set the immutability of the annotation set. If it is immutable,
        annotations cannot be added or removed from the set, but the
        annotations themselves can still have their features modified.

        All detached annotation sets are immutable when created, but can be
        made mutable afterwards.
        """
        return self._is_immutable

    @immutable.setter
    def immutable(self, val: bool) -> None:
        self._is_immutable = val

    def isdetached(self) -> bool:
        """
        Returns True if the annotation set is detached (has no owning
        document), False otherwise.
        """
        return self._owner_doc is None

    def _create_index_by_offset(self) -> None:
        """
        Generates the offset index, if it does not already exist.
        The offset index is an interval tree that stores the annotation ids
        for the offset interval of the annotation.
        """
        if self._index_by_offset is None:
            self._index_by_offset = SortedIntvls()
            for ann in self._annotations.values():
                self._index_by_offset.add(ann.start, ann.end, ann.id)

    def _create_index_by_type(self) -> None:
        """
        Generates the type index, if it does not already exist. The type index
        is a map from annotation type to a set of all annotation ids with that
        type.
        """
        if self._index_by_type is None:
            self._index_by_type = defaultdict(set)
            for ann in self._annotations.values():
                self._index_by_type[ann.type].add(ann.id)

    def _add_to_indices(self, annotation: Annotation) -> None:
        """
        If we have created the indices, add the annotation to them.

        Args:
            annotation: the annotation to add to the indices.
        """
        if self._index_by_type is not None:
            self._index_by_type[annotation.type].add(annotation.id)
        if self._index_by_offset is not None:
            self._index_by_offset.add(annotation.start, annotation.end, annotation.id)

    def _remove_from_indices(self, annotation: Annotation) -> None:
        """
        Remove an annotation from the indices that have been created.

        Args:
            annotation: the annotation to remove.
        """
        if self._index_by_offset is not None:
            self._index_by_offset.remove(
                annotation.start, annotation.end, annotation.id
            )
        if self._index_by_type is not None:
            self._index_by_type[annotation.type].remove(annotation.id)

    @staticmethod
    def _intvs2idlist(intvs, ignore=None) -> List[int]:
        """
        Convert an iterable of interval tuples (start, end, id) to a list of ids

        Args:
            intvs: iterable of interval tuples
            ignore: an optional annotation id that should not get included
                in the result (Default value = None)

        Returns:
            list of ids
        """
        if ignore is not None:
            return [i[2] for i in intvs if i[2] != ignore]
        return [i[2] for i in intvs]

    @staticmethod
    def _intvs2idset(intvs, ignore=None) -> Set[int]:
        """
        Convert an iterable of interval tuples (start, end, id) to a set of ids

        Args:
            intvs: iterable of interval tuples
            ignore: an optional annotation id that should not get included
                in the result (Default value = None)

        Returns:
            set of ids
        """
        if ignore is not None:
            return {i[2] for i in intvs if i[2] != ignore}
        return {i[2] for i in intvs}

    def _restrict_intvs(self, intvs, ignore=None) -> "AnnotationSet":
        """
        Create a detached set holding the annotations named by the interval
        tuples.

        Args:
            intvs: iterable of interval tuples (start, end, id)
            ignore: an optional annotation id to leave out (Default value = None)

        Returns:
            an immutable detached annotation set
        """
        return self.detach(
            restrict_to=AnnotationSet._intvs2idlist(intvs, ignore=ignore)
        )

    def __len__(self) -> int:
        """
        Return number of annotations in the set.
        """
        return len(self._annotations)

    @property
    def size(self) -> int:
        """
        Returns the number of annotations in the annotation set.
        """
        return len(self._annotations)

    @property
    def document(self) -> Union["Document", None]:
        """
        Returns the owning document, if set. If the owning document was not
        set, returns None.
        """
        return self._owner_doc

    @support_annotation_or_set
    def _check_offsets(self, start: int, end: int, annid=None) -> None:
        """
        Checks the offsets for the given span/annotation against the document
        boundaries, if we know the owning document and if the owning document
        has text.

        Args:
            start: start offset of the span
            end: end offset of the span
            annid: not used by the check itself (Default value = None)

        Raises:
            InvalidOffsetError: if an offset is negative, the span ends before
                it starts, or an offset lies beyond the document length.
        """
        if self._owner_doc is None:
            return
        if self._owner_doc.text is None:
            return
        doc_size = len(self._owner_doc)

        if start < 0:
            raise InvalidOffsetError("Annotation starts before 0")
        if end < 0:
            raise InvalidOffsetError("Annotation ends before 0")
        if start > end:
            raise InvalidOffsetError("Annotation ends before it starts")
        if start > doc_size:
            raise InvalidOffsetError(
                "Annotation starts after document ends: start={}, docsize={}".format(
                    start, doc_size
                )
            )
        if end > doc_size:
            raise InvalidOffsetError(
                "Annotation ends after document ends: end={}, docsize={}".format(
                    end, doc_size
                )
            )

    @property
    def start(self):
        """
        Returns the smallest start offset of all annotations, i.e the start
        of the span of the whole set. This needs the index and creates
        it if necessary.

        Throws:
            an exception if there are no annotations in the set.
        """
        if self.size == 0:
            raise Exception("Annotation set is empty, cannot determine start offset")
        self._create_index_by_offset()
        return self._index_by_offset.min_start()

    @property
    def end(self):
        """
        Returns the end offset of the annotation set, i.e. the biggest end
        offset of any annotation. This needs the index and creates it if
        necessary.

        Throws:
            an exception if there are no annotations in the set.
        """
        if self.size == 0:
            raise Exception("Annotation set is empty, cannot determine end offset")
        self._create_index_by_offset()
        return self._index_by_offset.max_end()

    @property
    def length(self):
        """
        Returns the length of the annotation set span.

        Throws:
            an exception if there are no annotations in the set.
        """
        # `start` and `end` are properties, so they must not be called.
        return self.end - self.start

    @allowspan
    def add(
        self,
        start: int,
        end: int,
        anntype: str,
        features: Dict[str, Any] = None,
        annid: int = None,
    ):
        """
        Adds an annotation to the set. Once an annotation has been added,
        the start and end offsets, the type, and the annotation id of the
        annotation are immutable.

        Args:
            start: start offset
            end: end offset
            anntype: the annotation type
            features: a map, an iterable of tuples or an existing feature map.
                In any case, the features are used to create a new feature map
                for this annotation. If the map is empty or this parameter is
                None, the annotation does not store any map at all.
            annid: the annotation id, if not specified the next free one for
                this set is used. NOTE: the id should normally left
                unspecified and get assigned automatically.

        Returns:
            the new annotation

        Raises:
            Exception: if `annid`/`features` have the wrong type, the set is
                immutable, or an annotation with `annid` already exists.
            InvalidOffsetError: if the offsets do not fit the owning document.
        """
        if annid is not None and not isinstance(annid, int):
            raise Exception("Parameter annid must be an int, mixed up with features?")
        if features is not None and isinstance(features, int):
            raise Exception(
                "Parameter features must not be an int: mixed up with annid?"
            )
        if self._is_immutable:
            raise Exception("Cannot add an annotation to an immutable annotation set")
        self._check_offsets(start, end)
        # Compare against None so that the valid id 0 is checked as well.
        if annid is not None and annid in self._annotations:
            raise Exception(
                "Cannot add annotation with id {}, already in set".format(annid)
            )
        if annid is None:
            annid = self._next_annid
            self._next_annid = self._next_annid + 1
        ann = Annotation(start, end, anntype, features=features, annid=annid)
        ann._owner_set = self
        if not self._annotations:
            self._annotations = {}
        self._annotations[annid] = ann
        self._add_to_indices(ann)
        if self.changelog is not None:
            entry = {
                "command": "annotation:add",
                "set": self.name,
                "start": ann.start,
                "end": ann.end,
                "type": ann.type,
                "features": ann._features.to_dict(),
                "id": ann.id,
            }
            self.changelog.append(entry)
        return ann

    def add_ann(self, ann, annid: int = None):
        """
        Adds a shallow copy of the given ann to the annotation set,
        either with a new annotation id or with the one given.

        Args:
            ann: the annotation to copy into the set
            annid: the annotation id, if not specified the next free one for
                this set is used. Note: the id should normally left
                unspecified and get assigned automatically.

        Returns:
            the added annotation
        """
        return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)

    def remove(
        self, annoriter: Union[int, Annotation, Iterable], raise_on_notexisting=True
    ) -> None:
        """
        Removes the given annotation which is either the id or the annotation
        instance or recursively all annotations in the iterable.

        Throws:
            exception if the annotation set is immutable or the annotation
            is not in the set

        Args:
            annoriter: either the id (int) or the annotation instance
                (Annotation) or an iterable of id or annotation instance or
                iterable ...
            raise_on_notexisting: (default: True) if false, silently accepts
                non-existing annotations/ids and does nothing.
                Note: if this is True, but the annotation set is immutable,
                an Exception is still raised.
        """
        if self._is_immutable:
            raise Exception(
                "Cannot remove an annotation from an immutable annotation set"
            )
        if isinstance(annoriter, Iterable):
            for a in annoriter:
                self.remove(a, raise_on_notexisting=raise_on_notexisting)
            return
        if isinstance(annoriter, int):
            annid = annoriter
            if annid not in self._annotations:
                if not raise_on_notexisting:
                    return
                raise Exception(
                    "Annotation with id {} not in annotation set, cannot remove".format(
                        annid
                    )
                )
            annoriter = self._annotations[annid]
        elif isinstance(annoriter, Annotation):
            annid = annoriter.id
            if annid not in self._annotations:
                if not raise_on_notexisting:
                    return
                raise Exception(
                    "Annotation with id {} does not belong to this set, cannot remove".format(
                        annid
                    )
                )
        else:
            raise Exception(
                "Cannot remove {!r}: not an annotation id, annotation or iterable".format(
                    annoriter
                )
            )
        # NOTE: once the annotation has been removed from the set, it could still be referenced
        # somewhere else and its features could get modified. In order to prevent logging of such changes,
        # the owning set gets cleared for the annotation
        annoriter._owner_set = None
        del self._annotations[annid]
        if self.changelog is not None:
            self.changelog.append(
                {"command": "annotation:remove", "set": self.name, "id": annid}
            )
        self._remove_from_indices(annoriter)

    def clear(self) -> None:
        """
        Removes all annotations from the set and drops both indices.
        """
        # NOTE(review): unlike add() and remove(), this does not check the
        # immutable flag -- confirm whether that is intended.
        self._annotations.clear()
        self._index_by_offset = None
        self._index_by_type = None
        if self.changelog is not None:
            self.changelog.append({"command": "annotations:clear", "set": self.name})

    def clone_anns(self, memo=None):
        """
        Replaces the annotations in this set with deep copies of the
        originals. If this is a detached set, then this makes sure that any
        modifications to the annotations do not affect the original
        annotations in the attached set. If this is an attached set, it makes
        sure that all other detached sets cannot affect the annotations in
        this set any more. The owning set of the annotations that get cloned
        is cleared.

        Args:
            memo: for internal use by our __deepcopy__ implementation.
        """
        tmpdict = {}
        for annid, ann in self._annotations.items():
            newann = copy.deepcopy(ann, memo=memo)
            ann._owner_set = None
            tmpdict[annid] = newann
        # Replace only after the loop so the dict is not modified while iterating.
        for annid, ann in tmpdict.items():
            self._annotations[annid] = ann

    def __copy__(self):
        """
        NOTE: creating a copy always creates a detached set, but a mutable one.
        """
        c = self.detach()
        c._is_immutable = False
        return c

    def copy(self):
        """
        Returns a shallow copy of the annotation set.
        """
        return self.__copy__()

    def __deepcopy__(self, memo=None):
        """
        Returns a detached, mutable set whose annotations are deep copies.
        """
        if memo is None:
            memo = {}
        c = self.detach()
        c._is_immutable = False
        c.clone_anns(memo=memo)
        return c

    def deepcopy(self):
        """
        Returns a deep copy of the annotation set.
        """
        return copy.deepcopy(self)

    def __iter__(self) -> Iterator:
        """
        Yields all the annotations of the set.

        Important: using the iterator will always create the index if it
        is not already there! For fast iteration use fast_iter() which does
        not allow sorting or offset ranges.

        Yields:
            the annotations in document order
        """
        return self.iter()

    def fast_iter(self) -> Generator:
        """
        Yields annotations in insertion order. This is faster then the
        default iterator and does not need to index (so if the index does not
        exist, it will not be built).
        """
        if self._annotations:
            yield from self._annotations.values()

    def iter(
        self,
        start_ge: Union[int, None] = None,
        start_lt: Union[None, int] = None,
        with_type: str = None,
        reverse: bool = False,
    ) -> Generator:
        """
        Yields annotations in document order, optionally limited by the other
        parameters. If two annotations start at the same offset, they are
        always ordered by increasing annotation id.

        Args:
            start_ge: the offset from where to start including annotations
            start_lt: the last offset to use as the starting offset of an
                annotation
            with_type: only annotations of this type; either a single type
                name or an iterable of type names
            reverse: process in reverse document order

        Yields:
            annotations in document order
        """
        if with_type is not None:
            # Test the argument itself: a single type name must not be
            # split into its characters.
            if isinstance(with_type, str):
                allowedtypes = {with_type}
            else:
                allowedtypes = set(with_type)
        else:
            allowedtypes = None
        if not self._annotations:
            return
        maxoff = None
        if start_ge is not None:
            assert start_ge >= 0
        if start_lt is not None:
            assert start_lt >= 1
            maxoff = start_lt + 1
        if start_lt is not None and start_ge is not None:
            assert start_lt > start_ge
        self._create_index_by_offset()
        for _start, _end, annid in self._index_by_offset.irange(
            minoff=start_ge, maxoff=maxoff, reverse=reverse
        ):
            if (
                allowedtypes is not None
                and self._annotations[annid].type not in allowedtypes
            ):
                continue
            yield self._annotations[annid]

    def reverse_iter(self, **kwargs):
        """
        Same as iter, but with the reverse parameter set to true.

        Args:
            **kwargs: same as for iter(), with reverse=True fixed.

        Returns:
            same result as iter()
        """
        return self.iter(reverse=True, **kwargs)

    def get(
        self, annid: Union[int, Annotation], default=None
    ) -> Union[Annotation, None]:
        """
        Gets the annotation with the given annotation id or returns the given
        default.

        NOTE: for handling cases where legacy code still expects the add
        method to return an id and not the annotation, this will accept an
        annotation so the the frequent pattern still works:

            annid = annset.add(b,e,t).id
            ann = annset.get(annid)

        If an annotation is passed the annotation from the set with the id of
        that annotation is returned, if the annotation is from that set, this
        will return the same object, if it is still in the set (or return the
        default value).

        Args:
            annid: the annotation id of the annotation to retrieve, or an
                annotation whose id is used.
            default: what to return if an annotation with the given id is not
                found. (Default value = None)

        Returns:
            the annotation or the default value.
        """
        if isinstance(annid, Annotation):
            annid = annid.id
        return self._annotations.get(annid, default)

    def first(self):
        """
        Returns the first annotation in document order (builds the offset
        index if the set has more than one annotation).

        Raises:
            Exception: if the set is empty.
        """
        sz = len(self._annotations)
        if sz == 0:
            raise Exception("Empty set, there is no first annotation")
        elif sz == 1:
            return next(iter(self._annotations.values()))
        self._create_index_by_offset()
        _, _, annid = next(self._index_by_offset.irange(reverse=False))
        return self._annotations[annid]

    def last(self):
        """
        Returns the last annotation in document order (builds the offset
        index if the set has more than one annotation).

        Raises:
            Exception: if the set is empty.
        """
        sz = len(self._annotations)
        if sz == 0:
            raise Exception("Empty set, there is no last annotation")
        elif sz == 1:
            return next(iter(self._annotations.values()))
        self._create_index_by_offset()
        _, _, annid = next(self._index_by_offset.irange(reverse=True))
        return self._annotations[annid]

    def __getitem__(self, annid):
        """
        Gets the annotation with the given annotation id or throws an exception.

        Args:
            annid: the annotation id

        Returns:
            annotation
        """
        return self._annotations[annid]

    def with_type(
        self, *anntype: Union[str, Iterable], non_overlapping: bool = False
    ) -> "AnnotationSet":
        """
        Gets annotations of the specified type(s).
        Creates the type index if necessary.

        Args:
            anntype: one or more types or type lists. The union of all types
                specified that way is used to filter the annotations. If no
                type is specified, all annotations are selected.
            non_overlapping: if True, only return annotations of any of the
                given types which do not overlap with other annotations. If
                there are several annotations that start at the same offset,
                the type that comes first in the parameters wins; among
                annotations of that type the one with the larger end offset
                and then the larger annotation id is selected.

        Returns:
            a detached immutable annotation set with the matching annotations.
        """
        atypes = []
        for atype in anntype:
            if isinstance(atype, str):
                atypes.append(atype)
            else:
                atypes.extend(atype)
        if not atypes:
            return self.detach()
        self._create_index_by_type()
        annids = set()
        for t in atypes:
            idxs = self._index_by_type.get(t)
            if idxs:
                annids.update(idxs)
        if non_overlapping:
            # need to get annotations grouped by start offset and sorted according to
            # what the Annotation class defines
            allanns = sorted(annids, key=lambda x: self._annotations[x])
            allanns = [self._annotations[x] for x in allanns]
            allannsgrouped = []
            curstart = None
            curset = None
            for ann in allanns:
                if curstart is None:
                    curset = [ann]
                    curstart = ann.start
                elif curstart == ann.start:
                    curset.append(ann)
                else:
                    allannsgrouped.append(curset)
                    curset = [ann]
                    curstart = ann.start
            if curset:
                allannsgrouped.append(curset)
            retanns = []
            # now go through all the grouped annotations and select the top priority one
            # then skip to the next group that does not overlap with the one we just selected
            typepriority = dict()
            for i, atype in enumerate(atypes):
                # Earlier positions in the argument list get the higher priority.
                typepriority[atype] = len(atypes) - i
            curminoffset = 0
            for group in allannsgrouped:
                # instead of sorting, go through the group and find the top priority one
                topann = None
                if len(group) == 1:
                    if group[0].start >= curminoffset:
                        topann = group[0]
                elif len(group) == 0:
                    raise Exception("We should never get a 0 size group here!")
                else:
                    for i, ann in enumerate(group):
                        if ann.start >= curminoffset:
                            topann = ann
                            break
                    for ann in group[i + 1 :]:
                        if ann.start < curminoffset:
                            continue
                        if typepriority[ann.type] > typepriority[topann.type]:
                            topann = ann
                        elif typepriority[ann.type] == typepriority[topann.type]:
                            if ann.end > topann.end:
                                topann = ann
                            elif ann.end == topann.end:
                                if ann.id > topann.id:
                                    topann = ann
                if topann is not None:
                    retanns.append(topann)
                    curminoffset = topann.end
            annids = [ann.id for ann in retanns]
        return self.detach(restrict_to=annids)

    def by_offset(self):
        """
        Yields lists of annotations which start at the same offset.
        """
        self._create_index_by_offset()
        lastoff = -1
        curlist = []
        for ann in self.iter():
            if ann.start != lastoff:
                if lastoff != -1:
                    yield curlist
                lastoff = ann.start
                curlist = [ann]
            else:
                curlist.append(ann)
        if lastoff != -1:
            yield curlist

    def by_span(self):
        """
        Yields list of annotations with identical spans.
        """
        self._create_index_by_offset()
        lastsoff = -1
        lasteoff = -1
        curlist = []
        for ann in self.iter():
            if ann.start != lastsoff or ann.end != lasteoff:
                if lastsoff != -1:
                    yield curlist
                lastsoff = ann.start
                lasteoff = ann.end
                curlist = [ann]
            else:
                curlist.append(ann)
        if lastsoff != -1:
            yield curlist

    @property
    def type_names(self) -> KeysView[str]:
        """
        Gets the names of all types in this set. Creates the type index
        if necessary.
        """
        self._create_index_by_type()
        return self._index_by_type.keys()

    @support_annotation_or_set
    def start_eq(
        self, start: int, ignored: Any = None, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Gets all annotations starting at the given offset (empty if none) and
        returns them in a detached annotation set.

        Note: this can be called with an annotation or annotation set instead
        of the start offset. If called with an annotation, this annotation is
        not included in the result set if `include_self` is `False`

        Args:
            start: the offset where annotations should start
            ignored: dummy parameter to allow the use of annotations and
                annotation sets
            annid: dummy parameter to allow the use of annotations and
                annotation sets
            include_self: should annotation passed be included in the result

        Returns:
            detached annotation set of matching annotations
        """
        self._create_index_by_offset()
        # starting_from() also yields later annotations, so keep only the
        # intervals whose start offset equals the requested one.
        intvs = [
            intv
            for intv in self._index_by_offset.starting_from(start)
            if intv[0] == start
        ]
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def start_min_ge(
        self, offset: int, ignored: Any = None, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Gets all annotations starting at the first possible offset
        at or after the given offset and returns them in an immutable
        annotation set.

        Args:
            offset: The offset
            ignored: dummy parameter to allow the use of annotations and
                annotation sets
            annid: annotation id
            include_self: should annotation passed be included in the result

        Returns:
            annotation set of matching annotations
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.starting_from(offset)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        # now select only those first ones which all have the same offset
        retids = set()
        startoff = None
        for intv in intvs:
            if startoff is None:
                startoff = intv[0]
            elif startoff != intv[0]:
                break
            if ignore is None or ignore != intv[2]:
                retids.add(intv[2])
        return self.detach(restrict_to=retids)

    @support_annotation_or_set
    def start_ge(
        self, start: int, ignored: Any = None, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Return the annotations that start at or after the given start offset.

        Args:
            start: Start offset
            ignored: dummy parameter to allow the use of annotations and
                annotation sets
            annid: annotation id
            include_self: should annotation passed be included in the result

        Returns:
            an immutable annotation set of the matching annotations
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.starting_from(start)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def start_lt(self, offset: int, ignored: Any = None, annid=None) -> "AnnotationSet":
        """
        Returns the annotations that start before the given offset
        (or annotation). This also accepts an annotation or set.

        Args:
            offset: offset before which the annotations should start
            ignored: dummy parameter to allow the use of annotations and
                annotation sets
            annid: annotation id

        Returns:
            an immutable annotation set of the matching annotations
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.starting_before(offset)
        return self._restrict_intvs(intvs)

    @support_annotation_or_set
    def overlapping(
        self, start: int, end: int, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Gets annotations overlapping with the given span. Instead of the start
        and end offsets, also accepts an annotation or annotation set.

        Args:
            start: start offset of the span
            end: end offset of the span
            annid: the annotation id of the annotation representing the span.
                (Default value = None)
            include_self: if False (the default) and the annotation id for the
                span is given, that annotation is left out of the result set;
                if True it is kept.

        Returns:
            an immutable annotation set with the matching annotations
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.overlapping(start, end)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def covering(
        self, start: int, end: int, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Gets the annotations which contain the given offset range
        (or annotation/annotation set), i.e. annotations such that the given
        offset range is within the annotation.

        Args:
            start: the start offset of the span
            end: the end offset of the span
            annid: the annotation id of the annotation representing the span.
                (Default value = None)
            include_self: if False (the default) and the annotation id for the
                span is given, that annotation is left out of the result set;
                if True it is kept.

        Returns:
            an immutable annotation set with the matching annotations, if any
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.covering(start, end)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def within(
        self, start: int, end: int, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Gets annotations that fall completely within the given offset range,
        i.e. annotations such that the offset range is covering each of the
        annotation.

        Args:
            start: start offset of the range
            end: end offset of the range
            annid: the annotation id of the annotation representing the span.
                (Default value = None)
            include_self: if False (the default) and the annotation id for the
                span is given, that annotation is left out of the result set;
                if True it is kept.

        Returns:
            an immutable annotation set with the matching annotations

        Raises:
            Exception: if `start` is greater than `end`.
        """
        if start == end:
            # A zero-length range yields an empty result without using the index.
            intvs = []
        elif start > end:
            raise Exception("Invalid offset range: {},{}".format(start, end))
        else:
            self._create_index_by_offset()
            intvs = self._index_by_offset.within(start, end)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @support_annotation_or_set
    def coextensive(
        self, start: int, end: int, annid=None, include_self=False
    ) -> "AnnotationSet":
        """
        Returns a detached annotation set with all annotations that start and
        end at the given offsets.

        Args:
            start: start offset of the span
            end: end offset of the span
            annid: the annotation id of the annotation representing the span.
                (Default value = None)
            include_self: if False (the default) and the annotation id for the
                span is given, that annotation is left out of the result set;
                if True it is kept.

        Returns:
            annotation set with all annotations that have the same start and
            end offsets.
        """
        self._create_index_by_offset()
        intvs = self._index_by_offset.at(start, end)
        if not include_self and annid is not None:
            ignore = annid
        else:
            ignore = None
        return self._restrict_intvs(intvs, ignore=ignore)

    @property
    def span(self) -> Span:
        """
        Returns a Span with the start and end offset that corresponds to the
        smallest start offset of any annotation and the largest end offset of
        any annotation. (Builds the offset index)
        """
        self._create_index_by_offset()
        return Span(self._index_by_offset.min_start(), self._index_by_offset.max_end())

    def __contains__(self, annorannid: Union[int, Annotation]) -> bool:
        """
        Provides 'annotation in annotation_set' functionality

        Args:
            annorannid: the annotation instance or annotation id to check

        Returns:
            `True` if the annotation exists in the set, `False` otherwise
        """
        if isinstance(annorannid, Annotation):
            return annorannid.id in self._annotations
        return (
            annorannid in self._annotations
        )  # On the off chance someone passed an ID in directly

    contains = __contains__

    def __repr__(self) -> str:
        """
        Returns the string representation of the set.
        """
        return "AnnotationSet({})".format(repr(list(self.iter())))

    def to_dict(self, anntypes=None, **kwargs):
        """
        Convert an annotation set to its dict representation.

        Args:
            anntypes: if not None, an iterable of annotation types to include
            **kwargs: passed on to the dict creation of contained annotations.

        Returns:
            the dict representation of the annotation set.
        """
        if anntypes is not None:
            anntypesset = set(anntypes)
            anns_list = [
                val.to_dict(**kwargs)
                for val in self._annotations.values()
                if val.type in anntypesset
            ]
        else:
            anns_list = [val.to_dict(**kwargs) for val in self._annotations.values()]
        return {
            # NOTE: Changelog is not getting added as it is stored in the document part!
            "name": self.name,
            "annotations": anns_list,
            "next_annid": self._next_annid,
        }

    @staticmethod
    def from_dict(dictrepr, owner_doc=None, **kwargs):
        """
        Create an AnnotationSet from its dict representation and optionally
        set the owning document.

        Args:
            dictrepr: the dict representation of the annotation set
            owner_doc: the owning document
            **kwargs: passed on to the creation of annotations

        Returns:
            the annotation set
        """
        annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc)
        annset._next_annid = dictrepr.get("next_annid")
        if dictrepr.get("annotations"):
            annset._annotations = dict(
                (int(a["id"]), Annotation.from_dict(a, owner_set=annset, **kwargs))
                for a in dictrepr.get("annotations")
            )
        else:
            annset._annotations = {}
        return annset

    @staticmethod
    def from_anns(anns, deep_copy=False, **kwargs):
        """
        Create a detached AnnotationSet from an iterable of annotations.

        The next annotation id of the new set is the highest annotation id
        seen plus one (0 if `anns` is empty).

        Args:
            anns: an iterable of annotations
            deep_copy: if the annotations should get added as copies
                (default) or deep copies.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            the annotation set
        """
        annset = AnnotationSet(name="", owner_doc=None)
        annset._annotations = dict()
        maxid = -1
        for ann in anns:
            if deep_copy:
                addann = ann.deepcopy()
            else:
                addann = ann.copy()
            annset._annotations[addann.id] = addann
            if addann.id > maxid:
                maxid = addann.id
        # One past the highest id, so a later add() cannot reuse an existing id.
        annset._next_annid = maxid + 1
        annset._is_immutable = True
        return annset
Static methods
def from_anns(anns, deep_copy=False, **kwargs)
-
Create a detached AnnotationSet from an iterable of annotations.
Args
anns
- an iterable of annotations
deep_copy
- if the annotations should get added as copies (default) or deep copies.
Returns
the annotation set
Expand source code
@staticmethod
def from_anns(anns, deep_copy=False, **kwargs):
    """
    Create a detached, immutable AnnotationSet from an iterable of annotations.

    Args:
        anns: an iterable of annotations
        deep_copy: if True the annotations get added as deep copies,
            otherwise as (shallow) copies, which is the default.
        **kwargs: accepted but not used by this method.

    Returns:
        the annotation set
    """
    annset = AnnotationSet(name="", owner_doc=None)
    annset._annotations = {}
    # Start below every possible id so that an empty input yields next id 0.
    maxid = -1
    for ann in anns:
        addann = ann.deepcopy() if deep_copy else ann.copy()
        annset._annotations[addann.id] = addann
        if addann.id > maxid:
            maxid = addann.id
    # The next free id has to be one past the largest id in use, otherwise
    # the first automatically assigned id would collide with an existing
    # annotation once the set is made mutable.
    annset._next_annid = maxid + 1
    annset._is_immutable = True
    return annset
def from_dict(dictrepr, owner_doc=None, **kwargs)
-
Create an AnnotationSet from its dict representation and optionally set the owning document.
Args
dictrepr
- the dict representation of the annotation set
owner_doc
- the owning document
**kwargs
- passed on to the creation of annotations
Returns
the annotation set
Expand source code
@staticmethod
def from_dict(dictrepr, owner_doc=None, **kwargs):
    """
    Build an AnnotationSet from its dict representation.

    Args:
        dictrepr: the dict representation of the annotation set, read via
            the keys "name", "next_annid" and "annotations"
        owner_doc: the document that should own the new set (may be None)
        **kwargs: passed on to the creation of each annotation

    Returns:
        the annotation set
    """
    result = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc)
    result._next_annid = dictrepr.get("next_annid")
    if not dictrepr.get("annotations"):
        # missing or empty annotation list
        result._annotations = {}
        return result
    by_id = {}
    for anndict in dictrepr.get("annotations"):
        key = int(anndict["id"])
        by_id[key] = Annotation.from_dict(anndict, owner_set=result, **kwargs)
    result._annotations = by_id
    return result
Instance variables
property/get changelog
-
Returns the changelog or None if no changelog is set.
Expand source code
@property
def changelog(self):
    """
    The changelog of the owning document.

    Returns:
        the owning document's changelog, or None if this set has no
        owning document.
    """
    owner = self._owner_doc
    return None if owner is None else owner.changelog
property/get document : Union[_ForwardRef('Document'), NoneType]
-
Returns the owning document, if set. If the owning document was not set, returns None.
Expand source code
@property
def document(self) -> Union["Document", None]:
    """
    The document that owns this annotation set.

    Returns:
        the owning document, or None if no owning document was set.
    """
    return self._owner_doc
property/get end
-
Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation. This needs the index and creates it if necessary.
Throws
an exception if there are no annotations in the set.
Expand source code
@property
def end(self):
    """
    The end offset of the annotation set, i.e. the biggest end offset of
    any annotation. This needs the offset index and creates it if necessary.

    Raises:
        Exception: if there are no annotations in the set.
    """
    if self.size != 0:
        self._create_index_by_offset()
        return self._index_by_offset.max_end()
    raise Exception("Annotation set is empty, cannot determine end offset")
property/get/set immutable : bool
-
Get or set the immutability of the annotation set. If it is immutable, annotations cannot be added or removed from the set, but the annotations themselves can still have their features modified.
All detached annotation sets are immutable when created, but can be made mutable afterwards.
Expand source code
@property
def immutable(self) -> bool:
    """
    Get or set the immutability of the annotation set. If it is immutable,
    annotations cannot be added to or removed from the set, but the
    annotations themselves can still have their features modified.

    All detached annotation sets are immutable when created,
    but can be made mutable afterwards.

    Returns:
        the current value of the immutability flag.
    """
    return self._is_immutable
property/get length
-
Returns the length of the annotation set span.
Throws
an exception if there are no annotations in the set.
Expand source code
@property
def length(self):
    """
    The length of the annotation set span, i.e. the difference between
    the end offset and the start offset of the set.

    Raises:
        Exception: if there are no annotations in the set (raised when the
            start/end offsets are determined).
    """
    # `end` and `start` are properties, not methods: they must be read, not
    # called (calling the returned offset would raise a TypeError).
    return self.end - self.start
property/get name
-
Returns the name of the annotation set.
Note: the name of a set cannot be changed.
Expand source code
@property
def name(self):
    """
    The name of the annotation set.

    Note: the name of a set cannot be changed.

    Returns:
        the name stored for this set.
    """
    return self._name
property/get size : int
-
Returns the number of annotations in the annotation set.
Expand source code
@property
def size(self) -> int:
    """
    The number of annotations in the annotation set.

    Returns:
        the number of entries in the internal annotation map.
    """
    return len(self._annotations)
property/get span : Span
-
Returns a span whose start and end offsets correspond to the smallest start offset of any annotation and the largest end offset of any annotation. (Builds the offset index)
Expand source code
@property
def span(self) -> Span:
    """
    The span covered by the whole set: from the smallest start offset of
    any annotation to the largest end offset of any annotation.
    (Builds the offset index)

    Returns:
        a Span with that start and end offset.
    """
    self._create_index_by_offset()
    index = self._index_by_offset
    smallest_start = index.min_start()
    largest_end = index.max_end()
    return Span(smallest_start, largest_end)
property/get start
-
Returns the smallest start offset of all annotations, i.e the start of the span of the whole set. This needs the index and creates it if necessary.
Throws
an exception if there are no annotations in the set.
Expand source code
@property
def start(self):
    """
    The smallest start offset of all annotations, i.e. the start of the
    span of the whole set. This needs the offset index and creates it if
    necessary.

    Raises:
        Exception: if there are no annotations in the set.
    """
    if self.size != 0:
        self._create_index_by_offset()
        return self._index_by_offset.min_start()
    raise Exception("Annotation set is empty, cannot determine start offset")
property/get type_names : KeysView[str]
-
Gets the names of all types in this set. Creates the type index if necessary.
Expand source code
@property
def type_names(self) -> KeysView[str]:
    """
    The names of all annotation types in this set.
    Creates the type index if necessary.

    Returns:
        the keys view of the type index.
    """
    self._create_index_by_type()
    type_index = self._index_by_type
    return type_index.keys()
Methods
def add(self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None)
-
Adds an annotation to the set. Once an annotation has been added, the start and end offsets, the type, and the annotation id of the annotation are immutable.
Args
start
- start offset
end
- end offset
anntype
- the annotation type
features
- a map, an iterable of tuples or an existing feature map. In any case, the features are used to create a new feature map for this annotation. If the map is empty or this parameter is None, the annotation does not store any map at all.
annid
- the annotation id, if not specified the next free one for this set is used. NOTE: the id should normally be left unspecified and get assigned automatically.
Returns
the new annotation
Expand source code
@allowspan
def add(
    self,
    start: int,
    end: int,
    anntype: str,
    features: Dict[str, Any] = None,
    annid: int = None,
):
    """
    Adds an annotation to the set.
    Once an annotation has been added, the start and end offsets,
    the type, and the annotation id of the annotation are immutable.

    Args:
        start: start offset
        end: end offset
        anntype: the annotation type
        features: a map, an iterable of tuples or an existing feature map.
            In any case, the features are used to create a new feature map
            for this annotation. If the map is empty or this parameter is
            None, the annotation does not store any map at all.
        annid: the annotation id, if not specified the next free one for
            this set is used. NOTE: the id should normally be left
            unspecified and get assigned automatically.

    Returns:
        the new annotation

    Raises:
        Exception: if annid is not an int, if features is an int, if the
            set is immutable, or if an annotation with the given id is
            already in the set.
    """
    if annid is not None and not isinstance(annid, int):
        raise Exception("Parameter annid must be an int, mixed up with features?")
    if features is not None and isinstance(features, int):
        raise Exception(
            "Parameter features must not be an int: mixed up with annid?"
        )
    if self._is_immutable:
        raise Exception("Cannot add an annotation to an immutable annotation set")
    self._check_offsets(start, end)
    # Compare against None explicitly: 0 is a valid annotation id, and a
    # plain truthiness test would let a duplicate id 0 overwrite the
    # existing annotation.
    if annid is not None and annid in self._annotations:
        raise Exception(
            "Cannot add annotation with id {}, already in set".format(annid)
        )
    if annid is None:
        annid = self._next_annid
        self._next_annid = self._next_annid + 1
    ann = Annotation(start, end, anntype, features=features, annid=annid)
    ann._owner_set = self
    if not self._annotations:
        self._annotations = {}
    self._annotations[annid] = ann
    self._add_to_indices(ann)
    if self.changelog is not None:
        # record the addition so it can be replayed from the changelog
        entry = {
            "command": "annotation:add",
            "set": self.name,
            "start": ann.start,
            "end": ann.end,
            "type": ann.type,
            "features": ann._features.to_dict(),
            "id": ann.id,
        }
        self.changelog.append(entry)
    return ann
def add_ann(self, ann, annid: int = None)
-
Adds a shallow copy of the given ann to the annotation set, either with a new annotation id or with the one given.
Args
ann
- the annotation to copy into the set
annid
- the annotation id, if not specified the next free one for this set is used. Note: the id should normally be left unspecified and get assigned automatically.
Returns
the added annotation
Expand source code
def add_ann(self, ann, annid: int = None):
    """
    Adds a new annotation to the set which is created from the start and
    end offsets, the type and the features of the given annotation, either
    with a new annotation id or with the one given.

    Args:
        ann: the annotation to copy into the set
        annid: the annotation id, if not specified the next free one for
            this set is used. Note: the id should normally be left
            unspecified and get assigned automatically.

    Returns:
        the added annotation
    """
    # delegates to add(), so all checks performed there apply here as well
    return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)
def by_offset(self)
-
Yields lists of annotations which start at the same offset.
Expand source code
def by_offset(self):
    """
    Yields lists of annotations which start at the same offset, in the
    order in which `iter()` returns the annotations.
    """
    self._create_index_by_offset()
    group_start = -1  # -1 marks "no group started yet"
    group = []
    for ann in self.iter():
        if ann.start == group_start:
            group.append(ann)
            continue
        # a new start offset: hand out the finished group, if there is one
        if group_start != -1:
            yield group
        group_start = ann.start
        group = [ann]
    if group_start != -1:
        yield group
def by_span(self)
-
Yields list of annotations with identical spans.
Expand source code
def by_span(self):
    """
    Yields lists of annotations with identical spans (same start and same
    end offset), in the order in which `iter()` returns the annotations.
    """
    self._create_index_by_offset()
    cur_start = -1  # -1 marks "no group started yet"
    cur_end = -1
    group = []
    for ann in self.iter():
        if ann.start == cur_start and ann.end == cur_end:
            group.append(ann)
            continue
        # a different span: hand out the finished group, if there is one
        if cur_start != -1:
            yield group
        cur_start = ann.start
        cur_end = ann.end
        group = [ann]
    if cur_start != -1:
        yield group
def clear(self) ‑> NoneType
-
Removes all annotations from the set.
Expand source code
def clear(self) -> None:
    """
    Removes all annotations from the set, drops both indices and, if a
    changelog is present, records the operation there.
    """
    self._annotations.clear()
    # the indices refer to the removed annotations, so discard them
    self._index_by_offset = None
    self._index_by_type = None
    if self.changelog is None:
        return
    self.changelog.append({"command": "annotations:clear", "set": self.name})
def clone_anns(self, memo=None)
-
Replaces the annotations in this set with deep copies of the originals. If this is a detached set, then this makes sure that any modifications to the annotations do not affect the original annotations in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect the annotations in this set any more. The owning set of the annotations that get cloned is cleared.
Args
memo
- for internal use by our deepcopy implementation.
Expand source code
def clone_anns(self, memo=None):
    """
    Replaces the annotations in this set with deep copies of the originals.
    If this is a detached set, then this makes sure that any modifications
    to the annotations do not affect the original annotations in the
    attached set. If this is an attached set, it makes sure that all other
    detached sets cannot affect the annotations in this set any more.
    The owning set of the original annotations (the ones that got cloned)
    is cleared.

    Args:
        memo: for internal use by our __deepcopy__ implementation.
    """
    clones = {}
    for annid, original in self._annotations.items():
        clones[annid] = copy.deepcopy(original, memo=memo)
        original._owner_set = None
    # swap in the clones only after the iteration over the map has finished
    self._annotations.update(clones)
def coextensive(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet
-
Returns a detached annotation set with all annotations that start and end at the given offsets.
For each annotation ann in the result set, ann.coextensive(span) is True.
Args
start
- start offset of the span
end
- end offset of the span
annid
- the annotation id of the annotation representing the span. (Default value = None)
include_self
- if False and the annotation id for the span is given, do not include that annotation in the result set. (Default value = False)
Returns
annotation set with all annotations that have the same start and end offsets.
Expand source code
@support_annotation_or_set
def coextensive(
    self, start: int, end: int, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Returns a detached annotation set with all annotations that start and
    end at the given offsets.

    For each annotation ann in the result set, ann.coextensive(span) is True.

    Args:
        start: start offset of the span
        end: end offset of the span
        annid: the annotation id of the annotation representing the span.
            (Default value = None)
        include_self: if False and the annotation id for the span is given,
            do not include that annotation in the result set.
            (Default value = False)

    Returns:
        annotation set with all annotations that have the same start and
        end offsets.
    """
    self._create_index_by_offset()
    matches = self._index_by_offset.at(start, end)
    # only skip the span's own annotation when the caller did not ask for it
    skip = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip)
def contains(self, annorannid: Union[int, Annotation]) ‑> bool
-
Provides 'annotation in annotation_set' functionality
Args
:param annorannid: the annotation instance or annotation id to check
Returns
True
if the annotation exists in the set,False
otherwiseExpand source code
def __contains__(self, annorannid: Union[int, Annotation]) -> bool:
    """
    Provides 'annotation in annotation_set' functionality.

    Args:
        annorannid: the annotation instance or annotation id to check.
            For an annotation instance, only its id is looked up.

    Returns:
        `True` if the annotation id exists in the set, `False` otherwise
    """
    if isinstance(annorannid, Annotation):
        key = annorannid.id
    else:
        # on the off chance someone passed an id in directly
        key = annorannid
    return key in self._annotations
def copy(self)
-
Returns a shallow copy of the annotation set.
Expand source code
def copy(self):
    """
    Returns a shallow copy of the annotation set.

    Returns:
        whatever this object's `__copy__` implementation returns.
    """
    return self.__copy__()
def covering(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet
-
Gets the annotations which contain the given offset range (or annotation/annotation set), i.e. annotations such that the given offset range is within the annotation.
For each annotation ann in the result set, ann.covering(span) is True.
Args
start
- the start offset of the span
end
- the end offset of the span
annid
- the annotation id of the annotation representing the span. (Default value = None)
include_self
- if False and the annotation id for the span is given, do not include that annotation in the result set. (Default value = False)
Returns
an immutable annotation set with the matching annotations, if any
Expand source code
@support_annotation_or_set
def covering(
    self, start: int, end: int, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Gets the annotations which contain the given offset range
    (or annotation/annotation set), i.e. annotations such that the given
    offset range is within the annotation.

    For each annotation ann in the result set, ann.covering(span) is True.

    Args:
        start: the start offset of the span
        end: the end offset of the span
        annid: the annotation id of the annotation representing the span.
            (Default value = None)
        include_self: if False and the annotation id for the span is given,
            do not include that annotation in the result set.
            (Default value = False)

    Returns:
        an immutable annotation set with the matching annotations, if any
    """
    self._create_index_by_offset()
    matches = self._index_by_offset.covering(start, end)
    # only skip the span's own annotation when the caller did not ask for it
    skip = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip)
def deepcopy(self)
-
Returns a deep copy of the annotation set.
Expand source code
def deepcopy(self):
    """
    Returns a deep copy of the annotation set.

    Returns:
        the result of `copy.deepcopy` applied to this set.
    """
    return copy.deepcopy(self)
def detach(self, restrict_to=None) ‑> AnnotationSet
-
Creates an immutable and detached copy of this set, optionally restricted to the given annotation ids. A detached annotation set does not have an owning document and deleting or adding annotations does not change the annotations stored with the document. However, the annotations in a detached annotation set are the same as those stored in the attached set, so updating their features will modify the annotations in the document as well.
Args
restrict_to
- an iterable of annotation ids, if None, all the annotations from this set.
Returns
an immutable annotation set
Expand source code
def detach(self, restrict_to=None) -> "AnnotationSet":
    """
    Creates an immutable and detached copy of this set, optionally
    restricted to the given annotation ids. A detached annotation set does
    not have an owning document and deleting or adding annotations does not
    change the annotations stored with the document. However, the
    annotations in a detached annotation set are the same objects as those
    stored in the attached set, so updating their features will modify the
    annotations in the document as well.

    Args:
        restrict_to: an iterable of annotation ids, if None, all the
            annotations from this set.

    Returns:
        an immutable annotation set
    """
    detached = AnnotationSet(name="detached-from:" + self.name)
    detached._is_immutable = True
    wanted_ids = self._annotations.keys() if restrict_to is None else restrict_to
    # share the annotation objects, only the id -> annotation map is new
    detached._annotations = {
        annid: self._annotations[annid] for annid in wanted_ids
    }
    detached._next_annid = self._next_annid
    return detached
def detach_from(self, anns: collections.abc.Iterable) ‑> AnnotationSet
-
Creates an immutable detached annotation set from the annotations in anns, which can be either a collection of annotations or a collection of annotation ids (int numbers) which are assumed to be annotation ids from this set.
The next annotation id for the created set is the highest seen annotation id from anns plus one.
Args
anns
- an iterable of annotations
Returns
an immutable detached annotation set
Expand source code
def detach_from(self, anns: Iterable) -> "AnnotationSet":
    """
    Creates an immutable detached annotation set from the annotations in
    anns which could be either a collection of annotations or annotation
    ids (int numbers) which are assumed to be the annotation ids from this
    set.

    The next annotation id for the created set is the highest seen
    annotation id from anns plus one.

    Args:
        anns: an iterable of annotations or annotation ids

    Returns:
        an immutable detached annotation set

    Raises:
        KeyError: if an annotation id is given that is not in this set.
    """
    annset = AnnotationSet(name="detached-from:" + self.name)
    annset._is_immutable = True
    annset._annotations = {}
    nextid = -1
    for ann in anns:
        if isinstance(ann, int):
            # an id: look up the annotation object in this set
            annid = ann
            ann = self._annotations[annid]
        else:
            annid = ann.id
        # Key by the annotation's own id (not by the builtin `id` function,
        # which would make every annotation object overwrite the same entry).
        annset._annotations[annid] = ann
        if annid > nextid:
            nextid = annid
    annset._next_annid = nextid + 1
    return annset
def fast_iter(self) ‑> Generator
-
Yields annotations in insertion order. This is faster than the default iterator and does not need the index (so if the index does not exist, it will not be built).
Expand source code
def fast_iter(self) -> Generator:
    """
    Yields annotations in insertion order. This is faster than the default
    iterator and does not need the index (so if the index does not exist,
    it will not be built).
    """
    if self._annotations:
        yield from self._annotations.values()
def first(self)
-
Args:
Returns
:return: first annotation
Expand source code
def first(self):
    """
    Returns the first annotation according to the offset index.

    Returns:
        the first annotation

    Raises:
        Exception: if the set is empty.
    """
    count = len(self._annotations)
    if count == 0:
        raise Exception("Empty set, there is no first annotation")
    if count == 1:
        # a single annotation: no need to build the index
        for only in self._annotations.values():
            return only
    self._create_index_by_offset()
    _start, _end, annid = next(self._index_by_offset.irange(reverse=False))
    return self._annotations[annid]
def get(self, annid: Union[int, Annotation], default=None) ‑> Union[Annotation, NoneType]
-
Gets the annotation with the given annotation id or returns the given default.
NOTE: for handling cases where legacy code still expects the add method to return an id and not the annotation, this will accept an annotation so that the frequent pattern still works:
annid = annset.add(b,e,t).id ann = annset.get(annid)
If an annotation is passed the annotation from the set with the id of that annotation is returned, if the annotation is from that set, this will return the same object, if it is still in the set (or return the default value).
Args
annid
- the annotation id of the annotation to retrieve.
default
- what to return if an annotation with the given id is not found. (Default value = None)
annid
- Union[int:
Annotation]:
Returns
the annotation or the default value.
Expand source code
def get(
    self, annid: Union[int, Annotation], default=None
) -> Union[Annotation, None]:
    """
    Gets the annotation with the given annotation id or returns the given
    default.

    NOTE: for handling cases where legacy code still expects the add method
    to return an id and not the annotation, this also accepts an annotation,
    so that the frequent pattern still works:

        annid = annset.add(b,e,t).id
        ann = annset.get(annid)

    If an annotation is passed, the annotation stored in this set under the
    id of that annotation is returned (or the default value if there is
    none).

    Args:
        annid: the annotation id of the annotation to retrieve, or an
            annotation whose id should be used.
        default: what to return if an annotation with the given id is not
            found. (Default value = None)

    Returns:
        the annotation or the default value.
    """
    key = annid.id if isinstance(annid, Annotation) else annid
    return self._annotations.get(key, default)
def isdetached(self) ‑> bool
-
Returns True if the annotation set is detached, False otherwise.
Expand source code
def isdetached(self) -> bool:
    """
    Returns True if the annotation set is detached, i.e. has no owning
    document, False otherwise.
    """
    return self._owner_doc is None
def iter(self, start_ge: Union[int, NoneType] = None, start_lt: Union[NoneType, int] = None, with_type: str = None, reverse: bool = False) ‑> Generator
-
Yields annotations in document order, optionally limited by the other parameters. If two annotations start at the same offset, they are always ordered by increasing annotation id.
Args
start_ge
- the offset from where to start including annotations
start_lt
- the last offset to use as the starting offset of an annotation
with_type
- only annotations of this type
reverse
- process in reverse document order
Yields
annotations in document order
Expand source code
def iter(
    self,
    start_ge: Union[int, None] = None,
    start_lt: Union[None, int] = None,
    with_type: str = None,
    reverse: bool = False,
) -> Generator:
    """
    Yields annotations in document order, optionally limited by the other
    parameters. If two annotations start at the same offset, they are
    always ordered by increasing annotation id.

    Args:
        start_ge: the offset from where to start including annotations
        start_lt: the last offset to use as the starting offset of an
            annotation
        with_type: only annotations of this type; either a single type name
            or an iterable of type names
        reverse: process in reverse document order

    Yields:
        annotations in document order
    """
    if with_type is None:
        allowedtypes = None
    elif isinstance(with_type, str):
        # A single type name. The check must look at `with_type` itself
        # (not the builtin `type`), otherwise the name would be iterated
        # character by character below.
        allowedtypes = {with_type}
    else:
        allowedtypes = set(with_type)
    if not self._annotations:
        return
    maxoff = None
    if start_ge is not None:
        assert start_ge >= 0
    if start_lt is not None:
        assert start_lt >= 1
        # NOTE(review): the bound handed to irange() is start_lt + 1;
        # confirm against the irange() contract that this matches the
        # documented meaning of start_lt.
        maxoff = start_lt + 1
    if start_lt is not None and start_ge is not None:
        assert start_lt > start_ge
    self._create_index_by_offset()
    for _start, _end, annid in self._index_by_offset.irange(
        minoff=start_ge, maxoff=maxoff, reverse=reverse
    ):
        ann = self._annotations[annid]
        if allowedtypes is not None and ann.type not in allowedtypes:
            continue
        yield ann
def last(self)
-
Args:
Returns
:return: last annotation
Expand source code
def last(self):
    """
    Returns the last annotation according to the offset index.

    Returns:
        the last annotation

    Raises:
        Exception: if the set is empty.
    """
    count = len(self._annotations)
    if count == 0:
        raise Exception("Empty set, there is no last annotation")
    if count == 1:
        # a single annotation: no need to build the index
        for only in self._annotations.values():
            return only
    self._create_index_by_offset()
    _start, _end, annid = next(self._index_by_offset.irange(reverse=True))
    return self._annotations[annid]
def overlapping(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet
-
Gets annotations overlapping with the given span. Instead of the start and end offsets, also accepts an annotation or annotation set.
For each annotation ann in the result set, ann.overlapping(span) is True
Args
start
- start offset of the span
end
- end offset of the span
annid
- the annotation id of the annotation representing the span. (Default value = None)
include_self
- if False and the annotation id for the span is given, do not include that annotation in the result set. (Default value = False)
Returns
an immutable annotation set with the matching annotations
Expand source code
@support_annotation_or_set
def overlapping(
    self, start: int, end: int, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Gets annotations overlapping with the given span. Instead of the start
    and end offsets, also accepts an annotation or annotation set.

    For each annotation ann in the result set, ann.overlapping(span) is True

    Args:
        start: start offset of the span
        end: end offset of the span
        annid: the annotation id of the annotation representing the span.
            (Default value = None)
        include_self: if False and the annotation id for the span is given,
            do not include that annotation in the result set.
            (Default value = False)

    Returns:
        an immutable annotation set with the matching annotations
    """
    self._create_index_by_offset()
    matches = self._index_by_offset.overlapping(start, end)
    # only skip the span's own annotation when the caller did not ask for it
    skip = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip)
def remove(self, annoriter: Union[int, Annotation, collections.abc.Iterable], raise_on_notexisting=True) ‑> NoneType
-
Removes the given annotation which is either the id or the annotation instance or recursively all annotations in the iterable.
Throws
exception if the annotation set is immutable or the annotation is not in the set
Args
annoriter
- either the id (int) or the annotation instance (Annotation) or an iterable of id or annotation instance or iterable …
raise_on_notexisting
- (default: True) if False, silently accepts non-existing annotations/ids and does nothing. Note: even if this is False, an Exception is still raised if the annotation set is immutable.
Expand source code
def remove(
    self, annoriter: Union[int, Annotation, Iterable], raise_on_notexisting=True
) -> None:
    """
    Removes the given annotation which is either the id or the annotation
    instance or recursively all annotations in the iterable.

    Args:
        annoriter: either the id (int) or the annotation instance
            (Annotation) or an iterable of id or annotation instance or
            iterable ...
        raise_on_notexisting: (default: True) if False, silently accepts
            non-existing annotations/ids and does nothing.
            Note: even if this is False, an Exception is still raised if
            the annotation set is immutable.

    Raises:
        Exception: if the annotation set is immutable, if the annotation is
            not in the set and raise_on_notexisting is True, or if the
            argument is neither an id, an annotation nor an iterable.
    """
    if self._is_immutable:
        raise Exception(
            "Cannot remove an annotation from an immutable annotation set"
        )
    if isinstance(annoriter, Iterable):
        for a in annoriter:
            self.remove(a, raise_on_notexisting=raise_on_notexisting)
        return
    if isinstance(annoriter, int):
        annid = annoriter
        if annid not in self._annotations:
            if raise_on_notexisting:
                raise Exception(
                    "Annotation with id {} not in annotation set, cannot remove".format(
                        annid
                    )
                )
            # caller asked us to ignore unknown ids
            return
        annoriter = self._annotations[annid]
    elif isinstance(annoriter, Annotation):
        annid = annoriter.id
        if annid not in self._annotations:
            if raise_on_notexisting:
                raise Exception(
                    "Annotation with id {} does not belong to this set, cannot remove".format(
                        annid
                    )
                )
            # caller asked us to ignore unknown annotations
            return
    else:
        # Fail early with a clear message instead of modifying an arbitrary
        # object and failing later with an unrelated error.
        raise Exception(
            "Cannot remove {!r}: expected an annotation id, an Annotation or an iterable".format(
                annoriter
            )
        )
    # NOTE: once the annotation has been removed from the set, it could still
    # be referenced somewhere else and its features could get modified. In
    # order to prevent logging of such changes, the owning set gets cleared
    # for the annotation
    annoriter._owner_set = None
    del self._annotations[annid]
    if self.changelog is not None:
        self.changelog.append(
            {"command": "annotation:remove", "set": self.name, "id": annid}
        )
    self._remove_from_indices(annoriter)
def reverse_iter(self, **kwargs)
-
Same as iter, but with the reverse parameter set to true.
Args
kwargs
- Same as for iter(), with reverse=True fixed.
**kwargs
- will get passed on to AnnotationSet.iter
Returns
same result as iter()
Expand source code
def reverse_iter(self, **kwargs):
    """
    Same as iter, but with the reverse parameter set to True.

    Args:
        **kwargs: passed on to this set's iter() method; must not contain
            `reverse`, since reverse=True is fixed here.

    Returns:
        same result as iter()
    """
    return self.iter(reverse=True, **kwargs)
def start_eq(self, start: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet
-
Gets all annotations starting at the given offset (empty if none) and returns them in a detached annotation set.
Note: this can be called with an annotation or annotation set instead of the start offset. If called with an annotation, this annotation is not included in the result set if
include_self
isFalse
Args
start
- the offset where annotations should start
ignored
- dummy parameter to allow the use of annotations and annotation sets
annid
- dummy parameter to allow the use of annotations and annotation sets
include_self
- should annotation passed be included in the result
Returns
detached annotation set of matching annotations
Expand source code
@support_annotation_or_set
def start_eq(
    self, start: int, ignored: Any = None, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Gets all annotations starting at the given offset (empty if none) and
    returns them in a detached annotation set.

    Note: this can be called with an annotation or annotation set instead
    of the start offset. If called with an annotation, this annotation is
    not included in the result set if `include_self` is `False`

    Args:
        start: the offset where annotations should start
        ignored: dummy parameter to allow the use of annotations and
            annotation sets
        annid: the id of the annotation passed instead of the offset, if any
        include_self: should annotation passed be included in the result

    Returns:
        detached annotation set of matching annotations
    """
    self._create_index_by_offset()
    # starting_from() also returns every annotation that starts AFTER the
    # given offset (this is what start_ge uses), so keep only the intervals
    # whose start offset (first element of each interval) equals `start`.
    intvs = [
        intv
        for intv in self._index_by_offset.starting_from(start)
        if intv[0] == start
    ]
    if not include_self and annid is not None:
        ignore = annid
    else:
        ignore = None
    return self._restrict_intvs(intvs, ignore=ignore)
def start_ge(self, start: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet
-
Return the annotations that start at or after the given start offset.
Args
start
- Start offset
ignored
- dummy parameter to allow the use of annotations and annotation sets
annid
- annotation id
include_self
- should annotation passed be included in the result
Returns
an immutable annotation set of the matching annotations
Expand source code
@support_annotation_or_set
def start_ge(
    self, start: int, ignored: Any = None, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Return the annotations that start at or after the given start offset.

    Args:
        start: Start offset
        ignored: dummy parameter to allow the use of annotations and
            annotation sets
        annid: annotation id
        include_self: should annotation passed be included in the result

    Returns:
        an immutable annotation set of the matching annotations
    """
    self._create_index_by_offset()
    matches = self._index_by_offset.starting_from(start)
    # only skip the passed annotation when the caller did not ask for it
    skip = None if include_self else annid
    return self._restrict_intvs(matches, ignore=skip)
def start_lt(self, offset: int, ignored: Any = None, annid=None) ‑> AnnotationSet
-
Returns the annotations that start before the given offset (or annotation). This also accepts an annotation or set.
Args
offset
- offset before which the annotations should start
ignored
- dummy parameter to allow the use of annotations and annotation sets
annid
- annotation id
Returns
an immutable annotation set of the matching annotations
Expand source code
@support_annotation_or_set
def start_lt(self, offset: int, ignored: Any = None, annid=None) -> "AnnotationSet":
    """
    Return the annotations that start before the given offset.

    An annotation or annotation set may be passed instead of the offset.

    Args:
        offset: offset before which the annotations should start
        ignored: dummy parameter to allow the use of annotations and
            annotation sets
        annid: annotation id (not used by this method)

    Returns:
        an immutable annotation set of the matching annotations
    """
    self._create_index_by_offset()
    earlier = self._index_by_offset.starting_before(offset)
    return self._restrict_intvs(earlier)
def start_min_ge(self, offset: int, ignored: Any = None, annid=None, include_self=False) ‑> AnnotationSet
-
Gets all annotations starting at the first possible offset at or after the given offset and returns them in an immutable annotation set.
Args
offset
- The offset
ignored
- dummy parameter to allow the use of annotations and annotation sets
annid
- annotation id
include_self
- should annotation passed be included in the result
Returns
annotation set of matching annotations
Expand source code
@support_annotation_or_set
def start_min_ge(
    self, offset: int, ignored: Any = None, annid=None, include_self=False
) -> "AnnotationSet":
    """Get the annotations that start at the first offset at or after `offset`.

    All annotations sharing that first start offset are returned in an
    immutable annotation set.

    Args:
        offset: The offset
        ignored: dummy parameter to allow the use of annotations and
            annotation sets
        annid: annotation id; that annotation is left out of the result
            unless `include_self` is True
        include_self: should annotation passed be included in the result

    Returns:
        annotation set of matching annotations
    """
    self._create_index_by_offset()
    skip_id = None if (include_self or annid is None) else annid
    first_start = None
    selected = set()
    for entry in self._index_by_offset.starting_from(offset):
        entry_start = entry[0]
        entry_id = entry[2]
        if first_start is None:
            # The first entry fixes the start offset all results must share.
            first_start = entry_start
        elif entry_start != first_start:
            # Stop at the first entry with a different start offset.
            break
        if skip_id is None or entry_id != skip_id:
            selected.add(entry_id)
    return self.detach(restrict_to=selected)
def to_dict(self, anntypes=None, **kwargs)
-
Convert an annotation set to its dict representation.
Args
anntypes
- if not None, an iterable of annotation types to include
**kwargs
- passed on to the dict creation of contained annotations.
Returns
the dict representation of the annotation set.
Expand source code
def to_dict(self, anntypes=None, **kwargs):
    """
    Build the dict representation of this annotation set.

    Args:
        anntypes: if not None, an iterable of annotation types to include
        **kwargs: passed on to the dict creation of contained annotations.

    Returns:
        the dict representation of the annotation set.
    """
    members = self._annotations.values()
    if anntypes is not None:
        wanted = set(anntypes)
        members = (ann for ann in members if ann.type in wanted)
    serialized = [ann.to_dict(**kwargs) for ann in members]
    # NOTE: Changelog is not getting added as it is stored in the document part!
    return {
        "name": self.name,
        "annotations": serialized,
        "next_annid": self._next_annid,
    }
def with_type(self, *anntype: collections.abc.Iterable, non_overlapping: bool = False) ‑> AnnotationSet
-
Gets annotations of the specified type(s). Creates the type index if necessary.
Args
anntype
- one or more types or type lists. The union of all types specified that way is used to filter the annotations. If no type is specified, all annotations are selected.
non_overlapping
- if True, only return annotations of any of the given types which do not overlap with other annotations. If there are several annotations that start at the same offset, use the type that comes first in the parameters, if there are more than one of that type, use the one that would come first in the usual sort order.
Returns
a detached immutable annotation set with the matching annotations.
Expand source code
def with_type(
    self, *anntype: Union[str, Iterable], non_overlapping: bool = False
) -> "AnnotationSet":
    """
    Gets annotations of the specified type(s).
    Creates the type index if necessary.

    Args:
        anntype: one or more types or type lists. The union of all types
            specified that way is used to filter the annotations. If no type
            is specified, all annotations are selected.
        non_overlapping: if True, only return annotations of any of the given
            types which do not overlap with other annotations. If there are
            several annotations that start at the same offset, the type that
            comes first in the parameters wins. Among annotations of that
            same type the code below picks the one with the greater end
            offset and, if the ends are equal, the one with the greater id.

    Returns:
        a detached immutable annotation set with the matching annotations.
    """
    # Flatten the positional arguments: each one is either a single type name
    # or an iterable of type names.
    atypes = []
    for atype in anntype:
        if isinstance(atype, str):
            atypes.append(atype)
        else:
            for t in atype:
                atypes.append(t)
    # No type given at all: everything is selected.
    if not atypes:
        return self.detach()
    self._create_index_by_type()
    # Union of the annotation ids registered in the type index for each type.
    annids = set()
    for t in atypes:
        idxs = self._index_by_type.get(t)
        if idxs:
            annids.update(idxs)
    if non_overlapping:
        # need to get annotations grouped by start offset and sorted according to
        # what the Annotation class defines
        allanns = sorted(annids, key=lambda x: self._annotations[x])
        allanns = [self._annotations[x] for x in allanns]
        allannsgrouped = []
        curstart = None
        curset = None
        for ann in allanns:
            if curstart is None:
                curset = [ann]
                curstart = ann.start
            elif curstart == ann.start:
                curset.append(ann)
            else:
                allannsgrouped.append(curset)
                curset = [ann]
                curstart = ann.start
        # Flush the last group that is still open after the loop.
        if curset:
            allannsgrouped.append(curset)
        retanns = []
        # now go through all the grouped annotations and select the top priority one
        # then skip to the next group that does not overlap with the one we just selected
        # Earlier positions in atypes get the higher priority number.
        typepriority = dict()
        for i, atype in enumerate(atypes):
            typepriority[atype] = len(atypes) - i
        # Smallest start offset a group may have to be eligible, i.e. the end
        # of the annotation selected last.
        curminoffset = 0
        for group in allannsgrouped:
            # instead of sorting, go through the group and find the top priority one
            topann = None
            if len(group) == 1:
                if group[0].start >= curminoffset:
                    topann = group[0]
            elif len(group) == 0:
                raise Exception("We should never get a 0 size group here!")
            else:
                # First eligible annotation becomes the initial candidate ...
                for i, ann in enumerate(group):
                    if ann.start >= curminoffset:
                        topann = ann
                        break
                # ... and the remaining ones may replace it: higher type
                # priority first, then greater end offset, then greater id.
                for ann in group[i + 1 :]:
                    if ann.start < curminoffset:
                        continue
                    if typepriority[ann.type] > typepriority[topann.type]:
                        topann = ann
                    elif typepriority[ann.type] == typepriority[topann.type]:
                        if ann.end > topann.end:
                            topann = ann
                        elif ann.end == topann.end:
                            if ann.id > topann.id:
                                topann = ann
            if topann is not None:
                retanns.append(topann)
                # Later groups must start at or after the end of this one.
                curminoffset = topann.end
        annids = [ann.id for ann in retanns]
    return self.detach(restrict_to=annids)
def within(self, start: int, end: int, annid=None, include_self=False) ‑> AnnotationSet
-
Gets annotations that fall completely within the given offset range, i.e. annotations such that the offset range is covering each of the annotation.
For each annotation ann in the result set, ann.within(span) is True.
Args
start
- start offset of the range
end
- end offset of the range
annid
- the annotation id of the annotation representing the span. (Default value = None)
include_self
- if False and the annotation id for the span is given, that annotation is not included in the result set; if True, it is not excluded. (Default value = False)
Returns
an immutable annotation set with the matching annotations
Expand source code
@support_annotation_or_set
def within(
    self, start: int, end: int, annid=None, include_self=False
) -> "AnnotationSet":
    """
    Get the annotations that lie completely within the given offset range,
    i.e. annotations that are covered by the range.

    For each annotation ann in the result set, ann.within(span) is True.

    Args:
        start: start offset of the range
        end: end offset of the range
        annid: the annotation id of the annotation representing the span.
            (Default value = None)
        include_self: if False and the annotation id for the span is given,
            that annotation is left out of the result set; if True it is not
            excluded. (Default value = False)

    Returns:
        an immutable annotation set with the matching annotations

    Raises:
        Exception: if `start` is greater than `end`
    """
    if start > end:
        raise Exception("Invalid offset range: {},{}".format(start, end))
    if start == end:
        # A zero-length range cannot contain anything; skip the index lookup.
        matches = []
    else:
        self._create_index_by_offset()
        matches = self._index_by_offset.within(start, end)
    skip_id = None if (include_self or annid is None) else annid
    return self._restrict_intvs(matches, ignore=skip_id)
class ChangeLog (store=True)
-
Creates a ChangeLog.
A ChangeLog stores a log of all changes applied to a document. That log can be used to recreate the document from its initial version in a different process or at a later time.
Args
store
- if
True
, the change log stores the actions it receives (default). This can be set
to false if only callbacks are needed.
Expand source code
class ChangeLog:
    def __init__(self, store=True):
        """
        Creates a ChangeLog.

        A ChangeLog stores a log of all changes applied to a document.
        That log can be used to recreate the document from its initial version
        in a different process or at a later time.

        Args:
            store: if `True`, the change log stores the actions it receives
                (default). This can be set to false if only callbacks are needed.
        """
        self.changes = []
        self.offset_type = OFFSET_TYPE_PYTHON
        self._handlers = dict()
        self._store = store

    def add_handler(self, actions, handler):
        """
        Registers a handler to get called back when any of the actions is added.
        If any handler was already registered for one or more of the actions,
        the new handler overrides it.

        Args:
            actions: either a single action string or a collection of several
                action strings
            handler: a callable that takes the change information

        Raises:
            Exception: if one of the actions is not a known action
        """
        if isinstance(actions, str):
            actions = [actions]
        for a in actions:
            if a not in ACTIONS:
                raise Exception(f"Action {a} not known, cannot add handler")
            self._handlers[a] = handler

    def append(self, change: Dict):
        """
        Add a change to the change log. The change must be represented as a
        dictionary which follows the conventions of how to represent changes.
        This is not using an abstraction yet.

        If a handler is registered for the change's command, it is called with
        the change dict, whether or not the change is stored.

        Args:
            change: dict describing the action/modification

        Raises:
            Exception: if the change has no "command" key
        """
        assert isinstance(change, dict)
        action = change.get("command", None)
        if action is None:
            raise Exception("Odd change, does not have 'command' key")
        if self._store:
            self.changes.append(change)
        hndlr = self._handlers.get(action)
        if hndlr:
            # Handlers are documented to take the change information, so the
            # change has to be handed over.
            hndlr(change)

    def __len__(self) -> int:
        """
        Returns the number of actions logged in the ChangeLog.
        """
        return len(self.changes)

    def _fixup_changes(self, method: Callable, replace=False) -> List[Dict]:
        """Convert the "start" and "end" offsets of all changes with the given method.

        Args:
            method: a callable that converts a single offset, e.g. an offset
                mapper method for converting offsets from or to python.
            replace: if True, modifies the original change objects in the
                changelog, otherwise works on shallow copies and leaves the
                stored changes untouched (Default value = False)

        Returns:
            the converted changes: the instance's own changes list if
            `replace` is True, otherwise a new list of copied change dicts
        """
        target = self.changes if replace else [dict(chg) for chg in self.changes]
        for chg in target:
            if "start" in chg:
                chg["start"] = method(chg["start"])
            if "end" in chg:
                chg["end"] = method(chg["end"])
        return target

    def fixup_changes(self, offset_mapper, offset_type, replace=True):
        """Update the offsets of all annotations in this changelog to the desired
        offset type, if necessary. If the ChangeLog already has that offset type,
        this does nothing.

        Args:
            offset_mapper: a prepared offset mapper to use
            offset_type: the desired offset type
            replace: if True, replaces the original offsets in the original
                change objects, otherwise creates new change objects and a new
                changes list and returns it. (Default value = True)

        Returns:
            a reference to the modified changes

        Raises:
            Exception: if `offset_type` is not one of the known offset types
        """
        if offset_type != self.offset_type:
            if offset_type == OFFSET_TYPE_JAVA:
                method = offset_mapper.convert_to_java
            elif offset_type == OFFSET_TYPE_PYTHON:
                method = offset_mapper.convert_to_python
            else:
                raise Exception("Not a proper offset type: {}".format(offset_type))
            if replace:
                # Only an in-place conversion changes the type of this log.
                self.offset_type = offset_type
            return self._fixup_changes(method, replace=replace)
        else:
            return self.changes

    def __repr__(self) -> str:
        return "ChangeLog([{}])".format(",".join([str(c) for c in self.changes]))

    def format_to(self, fp, prefix="") -> None:
        """
        Prints the log to the given stream.

        Args:
            fp: stream to print to
            prefix: something to print in front of each action, default=""
        """
        for c in self.changes:
            print(prefix, str(c), sep="", file=fp)

    def to_dict(self, **kwargs):
        """
        Returns a dict representation of the ChangeLog.

        Args:
            **kwargs: `offset_type` optionally requests the offset type to use
                in the result; if it differs from the current offset type,
                `offset_mapper` must be given as well. The stored changes are
                not modified by the conversion.

        Returns:
            a dict with the keys "changes" and "offset_type"

        Raises:
            Exception: if a conversion is needed but no `offset_mapper` is given
        """
        offset_type = self.offset_type
        changes = self.changes
        if "offset_type" in kwargs and kwargs["offset_type"] != offset_type:
            om = kwargs.get("offset_mapper")
            if om is None:
                raise Exception(
                    "Need to convert offsets, but no offset_mapper parameter given"
                )
            offset_type = kwargs["offset_type"]
            if offset_type == OFFSET_TYPE_JAVA:
                changes = self._fixup_changes(om.convert_to_java, replace=False)
            else:
                changes = self._fixup_changes(om.convert_to_python, replace=False)
        return {"changes": changes, "offset_type": offset_type}

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """
        Creates a ChangeLog from a dict representation.

        If the representation uses JAVA offsets, they are converted to PYTHON
        offsets and the offset type of the new ChangeLog is set accordingly.

        Args:
            dictrepr: the dict representation to convert
            **kwargs: `offset_mapper` or `document` is required when the
                representation has offset type JAVA

        Returns:
            the ChangeLog instance, or None if `dictrepr` is None

        Raises:
            Exception: if JAVA offsets need converting but neither
                `offset_mapper` nor `document` is given
        """
        if dictrepr is None:
            return None
        cl = ChangeLog()
        cl.changes = dictrepr.get("changes")
        cl.offset_type = dictrepr.get("offset_type")
        if cl.offset_type == OFFSET_TYPE_JAVA:
            # we need either an offset mapper or a document
            if "offset_mapper" in kwargs:
                om = kwargs.get("offset_mapper")
            elif "document" in kwargs:
                om = OffsetMapper(kwargs.get("document"))
            else:
                raise Exception(
                    "Loading a changelog with offset_type JAVA, need kwarg 'offset_mapper' or 'document'"
                )
            # Convert in place: a non-replacing call returns converted copies
            # which would be thrown away here. The offset type is updated too so
            # that a later fixup does not convert the offsets a second time.
            cl._fixup_changes(om.convert_to_python, replace=True)
            cl.offset_type = OFFSET_TYPE_PYTHON
        return cl

    def save(
        self,
        whereto,
        fmt="json",
        offset_type=None,
        offset_mapper=None,
        mod="gatenlp.serialization.default",
        **kwargs,
    ):
        """
        Save the ChangeLog in the given format.

        Additional keyword parameters for format "json":
            as_array: boolean, if True stores as array instead of dictionary

        Args:
            whereto: either a file name or something that has a write(string)
                method.
            fmt: serialization format, one of "json", "msgpack" or "pickle"
                (Default value = "json")
            offset_type: store using the given offset type or keep the current
                if None (Default value = None)
            offset_mapper: needed if the offset type should get changed
                (Default value = None)
            mod: module to use (Default value = "gatenlp.serialization.default")
            **kwargs: additional parameters for the format
        """
        m = importlib.import_module(mod)
        saver = m.get_changelog_saver(whereto, fmt)
        saver(
            ChangeLog,
            self,
            to_ext=whereto,
            offset_type=offset_type,
            offset_mapper=offset_mapper,
            **kwargs,
        )

    def save_mem(
        self,
        fmt="json",
        offset_type=None,
        offset_mapper=None,
        mod="gatenlp.serialization.default",
        **kwargs,
    ):
        """
        Serialize and return the result instead of writing it somewhere.

        Additional keyword parameters for format "json":
            as_array: boolean, if True stores as array instead of dictionary

        Args:
            fmt: serialization format, one of "json", "msgpack" or "pickle"
                (Default value = "json")
            offset_type: store using the given offset type or keep the current
                if None (Default value = None)
            offset_mapper: needed if the offset type should get changed
                (Default value = None)
            mod: module to use (Default value = "gatenlp.serialization.default")
            **kwargs: additional parameters for the format

        Returns:
            whatever the saver of the serialization module returns
        """
        m = importlib.import_module(mod)
        saver = m.get_changelog_saver(None, fmt)
        return saver(
            ChangeLog,
            self,
            to_mem=True,
            offset_type=offset_type,
            offset_mapper=offset_mapper,
            **kwargs,
        )

    @staticmethod
    def load(
        wherefrom,
        fmt="json",
        offset_mapper=None,
        mod="gatenlp.serialization.default",
        **kwargs,
    ):
        """
        Load ChangeLog from some serialization.

        Args:
            wherefrom: the file or URL to load from
            offset_mapper: offset mapper in case the offsets need to get
                converted (Default value = None)
            fmt: the format to use (Default value = "json")
            mod: the serialization module to use
                (Default value = "gatenlp.serialization.default")
            **kwargs: any arguments to pass on to the loader

        Returns:
            the ChangeLog instance
        """
        m = importlib.import_module(mod)
        loader = m.get_changelog_loader(wherefrom, fmt)
        chl = loader(
            ChangeLog, from_ext=wherefrom, offset_mapper=offset_mapper, **kwargs
        )
        if chl.offset_type == OFFSET_TYPE_JAVA:
            chl.fixup_changes(
                offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True
            )
        return chl

    @staticmethod
    def load_mem(
        wherefrom,
        fmt="json",
        offset_mapper=None,
        mod="gatenlp.serialization.default",
        **kwargs,
    ):
        """
        Load a ChangeLog from a string representation in the given format.

        Note: the offset type is always converted to PYTHON when loading!

        Args:
            wherefrom: the string to deserialize
            fmt: the format to use, default: "json"
            offset_mapper: offset mapper in case the offsets need to get
                converted (Default value = None)
            mod: the serialization module to use
                (Default value = "gatenlp.serialization.default")
            **kwargs: arguments to pass on to the loader

        Returns:
            the ChangeLog instance
        """
        m = importlib.import_module(mod)
        loader = m.get_changelog_loader(None, fmt)
        chl = loader(
            ChangeLog, from_mem=wherefrom, offset_mapper=offset_mapper, **kwargs
        )
        if chl.offset_type == OFFSET_TYPE_JAVA:
            chl.fixup_changes(
                offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True
            )
        return chl

    def pprint(self, out=None):
        """
        Pretty prints to the given output stream, sys.stdout if not given.

        Args:
            out: the stream to print to, if None uses sys.stdout
        """
        if out is None:
            out = sys.stdout
        print("ChangeLog(", file=out)
        for i, c in enumerate(self.changes):
            cmd = c.get("command")
            parms = c.copy()
            # pop() instead of del: a change without "command" is shown with
            # cmd=None rather than raising KeyError.
            parms.pop("command", None)
            print(f"{i}: cmd={cmd} {parms}", file=out)
        print(")", file=out)
Static methods
def from_dict(dictrepr, **kwargs)
-
Creates a ChangeLog from a dict representation.
Args
dictrepr
- the dict representation to convert
**kwargs
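- `offset_mapper` or `document` is needed when the representation uses offset type JAVA; other keyword arguments are ignored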
Expand source code
@staticmethod
def from_dict(dictrepr, **kwargs):
    """
    Creates a ChangeLog from a dict representation.

    If the representation uses JAVA offsets, they are converted to PYTHON
    offsets and the offset type of the new ChangeLog is set accordingly.

    Args:
        dictrepr: the dict representation to convert
        **kwargs: `offset_mapper` or `document` is required when the
            representation has offset type JAVA

    Returns:
        the ChangeLog instance, or None if `dictrepr` is None

    Raises:
        Exception: if JAVA offsets need converting but neither
            `offset_mapper` nor `document` is given
    """
    if dictrepr is None:
        return None
    cl = ChangeLog()
    cl.changes = dictrepr.get("changes")
    cl.offset_type = dictrepr.get("offset_type")
    if cl.offset_type == OFFSET_TYPE_JAVA:
        # we need either an offset mapper or a document
        if "offset_mapper" in kwargs:
            om = kwargs.get("offset_mapper")
        elif "document" in kwargs:
            om = OffsetMapper(kwargs.get("document"))
        else:
            raise Exception(
                "Loading a changelog with offset_type JAVA, need kwarg 'offset_mapper' or 'document'"
            )
        # Convert in place: a non-replacing call returns converted copies
        # which would be thrown away here. The offset type is updated too so
        # that a later fixup does not convert the offsets a second time.
        cl._fixup_changes(om.convert_to_python, replace=True)
        cl.offset_type = OFFSET_TYPE_PYTHON
    return cl
def load(wherefrom, fmt='json', offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)
-
Load ChangeLog from some serialization.
Args
wherefrom
- the file or URL to load from
offset_mapper
- offset mapper in case the offsets need to get converted (Default value = None)
fmt
- the format to use (Default value = "json")
mod
- (Default value = "gatenlp.serialization.default")
**kwargs
- any arguments to pass on to the loader
Returns
the ChangeLog instance
Expand source code
@staticmethod
def load(
    wherefrom,
    fmt="json",
    offset_mapper=None,
    mod="gatenlp.serialization.default",
    **kwargs,
):
    """
    Load a ChangeLog from a file or URL using the given serialization module.

    Args:
        wherefrom: the file or URL to load from
        offset_mapper: offset mapper in case the offsets need to get
            converted (Default value = None)
        fmt: the format to use (Default value = "json")
        mod: name of the serialization module to import
            (Default value = "gatenlp.serialization.default")
        **kwargs: any arguments to pass on to the loader

    Returns:
        the ChangeLog instance
    """
    serializer = importlib.import_module(mod)
    read = serializer.get_changelog_loader(wherefrom, fmt)
    changelog = read(
        ChangeLog, from_ext=wherefrom, offset_mapper=offset_mapper, **kwargs
    )
    # A log that still carries JAVA offsets is converted in place.
    if changelog.offset_type == OFFSET_TYPE_JAVA:
        changelog.fixup_changes(
            offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True
        )
    return changelog
def load_mem(wherefrom, fmt='json', offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)
-
Load a ChangeLog from a string representation in the given format.
Note: the offset type is always converted to PYTHON when loading!
Args
wherefrom
- the string to deserialize
fmt
- the format to use, default: "json"
offset_mapper
- offset mapper in case the offsets need to get converted (Default value = None)
mod
- (Default value = "gatenlp.serialization.default")
**kwargs
- arguments to pass on to the loader
Returns
the ChangeLog instance
Expand source code
@staticmethod
def load_mem(
    wherefrom,
    fmt="json",
    offset_mapper=None,
    mod="gatenlp.serialization.default",
    **kwargs,
):
    """
    Load a ChangeLog from an in-memory representation in the given format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
        wherefrom: the string to deserialize
        fmt: the format to use, default: "json"
        offset_mapper: offset mapper in case the offsets need to get
            converted (Default value = None)
        mod: name of the serialization module to import
            (Default value = "gatenlp.serialization.default")
        **kwargs: arguments to pass on to the loader

    Returns:
        the ChangeLog instance
    """
    serializer = importlib.import_module(mod)
    read = serializer.get_changelog_loader(None, fmt)
    changelog = read(
        ChangeLog, from_mem=wherefrom, offset_mapper=offset_mapper, **kwargs
    )
    # A log that still carries JAVA offsets is converted in place.
    if changelog.offset_type == OFFSET_TYPE_JAVA:
        changelog.fixup_changes(
            offset_mapper, offset_type=OFFSET_TYPE_PYTHON, replace=True
        )
    return changelog
Methods
def add_handler(self, actions, handler)
-
Registers a handler to get called back when any of the actions is added. If any handler was already registered for one or more of the actions, the new handler overrides it.
Args
actions
- either a single action string or a collection of several action strings
handler
- a callable that takes the change information
Expand source code
def add_handler(self, actions, handler):
    """
    Register a callback for one or more actions.

    A handler that was registered earlier for any of the actions is
    replaced by the new one.

    Args:
        actions: either a single action string or a collection of several
            action strings
        handler: a callable that takes the change information

    Raises:
        Exception: if one of the actions is not a known action; handlers
            for the actions listed before it have been registered by then
    """
    names = [actions] if isinstance(actions, str) else actions
    for name in names:
        if name not in ACTIONS:
            raise Exception(f"Action {name} not known, cannot add handler")
        self._handlers[name] = handler
def append(self, change: Dict)
-
Add a change to the change log. The change must be represented as a dictionary which follows the conventions of how to represent changes. This is not using an abstraction yet.
Args
change
- dict describing the action/modification
Expand source code
def append(self, change: Dict): """ Add a change to the change log. The change must be represented as a dictionary which follows the conventions of how to represent changes. This is not using an abstraction yet. Args: change: dict describing the action/modification """ assert isinstance(change, dict) action = change.get("command", None) if action is None: raise Exception("Odd change, does not have 'command' key") if self._store: self.changes.append(change) hndlr = self._handlers.get(action) if hndlr: hndlr()
def fixup_changes(self, offset_mapper, offset_type, replace=True)
-
Update the offsets of all annotations in this changelog to the desired offset type, if necessary. If the ChangeLog already has that offset type, this does nothing.
Args
offset_mapper
- a prepared offset mapper to use
offset_type
- the desired offset type
replace
- if True, replaces the original offsets in the original change objects, otherwise creates
new change objects and a new changes list and returns it. (Default value = True)
Returns
a reference to the modified changes
Expand source code
def fixup_changes(self, offset_mapper, offset_type, replace=True): """Update the offsets of all annotations in this changelog to the desired offset type, if necessary. If the ChangeLog already has that offset type, this does nothing. Args: offset_mapper: a prepared offset mapper to use offset_type: the desired offset type replace: if True, replaces the original offsets in the original change objects, otherwise creates new change objects and a new changes list and returs it. (Default value = True) Returns: a reference to the modified changes """ if offset_type != self.offset_type: if offset_type == OFFSET_TYPE_JAVA: method = offset_mapper.convert_to_java elif offset_type == OFFSET_TYPE_PYTHON: method = offset_mapper.convert_to_python else: raise Exception("Not a proper offset type: {}".format(offset_type)) if replace: self.offset_type = offset_type return self._fixup_changes(method, replace=replace) else: return self.changes
def format_to(self, fp, prefix='') ‑> NoneType
-
Prints the log to the given stream.
Args
fp
- stream to print to
prefix
- something to print in front of each action, default=""
Expand source code
def format_to(self, fp, prefix="") -> None: """ Prints the log to the given stream. Args: fp: stream to print to prefix: something to print in front of each action, default="" """ for c in self.changes: print(prefix, str(c), sep="", file=fp)
def pprint(self, out=None)
-
Pretty prints to the given output stream, sys.stdout if not given.
Args
out
- the stream to print to, if None uses sys.stdout
Expand source code
def pprint(self, out=None): """ Pretty prints to the given output stream, sys.stdout if not given. Args: out: the stream to print to, if None uses sys.stdout """ if out is None: out = sys.stdout print("ChangeLog(", file=out) for i, c in enumerate(self.changes): cmd = c.get("command") parms = c.copy() del parms["command"] print(f"{i}: cmd={cmd} {parms}") print(")")
def save(self, whereto, fmt='json', offset_type=None, offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)
-
Save the ChangeLog in the given format.
Additional keyword parameters for format "json": as_array: boolean, if True stores as array instead of dictionary
Args
whereto
- either a file name or something that has a write(string) method.
fmt
- serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
offset_type
- store using the given offset type or keep the current if None (Default value = None)
offset_mapper
- needed if the offset type should get changed (Default value = None)
mod
- module to use (Default value = "gatenlp.serialization.default")
**kwargs
- additional parameters for the format
Expand source code
def save(
    self,
    whereto,
    fmt="json",
    offset_type=None,
    offset_mapper=None,
    mod="gatenlp.serialization.default",
    **kwargs,
):
    """
    Save the ChangeLog in the given format.

    Additional keyword parameters for format "json":
        as_array: boolean, if True stores as array instead of dictionary

    Args:
        whereto: either a file name or something that has a write(string)
            method.
        fmt: serialization format, one of "json", "msgpack" or "pickle"
            (Default value = "json")
        offset_type: store using the given offset type or keep the current
            if None (Default value = None)
        offset_mapper: needed if the offset type should get changed
            (Default value = None)
        mod: module to use (Default value = "gatenlp.serialization.default")
        **kwargs: additional parameters for the format
    """
    serializer = importlib.import_module(mod)
    write = serializer.get_changelog_saver(whereto, fmt)
    options = dict(
        to_ext=whereto, offset_type=offset_type, offset_mapper=offset_mapper
    )
    # Spread as two mappings so a duplicate key in kwargs still raises the
    # same TypeError as passing it twice by keyword.
    write(ChangeLog, self, **options, **kwargs)
def save_mem(self, fmt='json', offset_type=None, offset_mapper=None, mod='gatenlp.serialization.default', **kwargs)
-
Serialize and save to a string.
Additional keyword parameters for format "json": as_array: boolean, if True stores as array instead of dictionary
Args
fmt
- serialization format, one of "json", "msgpack" or "pickle" (Default value = "json")
offset_type
- store using the given offset type or keep the current if None (Default value = None)
offset_mapper
- needed if the offset type should get changed (Default value = None)
mod
- module to use (Default value = "gatenlp.serialization.default")
**kwargs
- additional parameters for the format
Expand source code
def save_mem( self, fmt="json", offset_type=None, offset_mapper=None, mod="gatenlp.serialization.default", **kwargs, ): """ Serialize and save to a string. Additional keyword parameters for format "json": as_array: boolean, if True stores as array instead of dictionary, using to Args: fmt: serialization format, one of "json", "msgpack" or "pickle" (Default value = "json") offset_type: store using the given offset type or keep the current if None (Default value = None) offset_mapper: nedded if the offset type should get changed (Default value = None) mod: module to use (Default value = "gatenlp.serialization.default") **kwargs: additional parameters for the format """ m = importlib.import_module(mod) saver = m.get_changelog_saver(None, fmt) return saver( ChangeLog, self, to_mem=True, offset_type=offset_type, offset_mapper=offset_mapper, **kwargs, )
def to_dict(self, **kwargs)
-
Returns a dict representation of the ChangeLog.
Args
**kwargs
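- `offset_type` optionally requests the offset type to use in the result; if it differs from the current offset type, `offset_mapper` must be given as well. Other keyword arguments are ignored.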
Expand source code
def to_dict(self, **kwargs): """ Returns a dict representation of the ChangeLog. Args: **kwargs: ignored """ offset_type = self.offset_type changes = self.changes if "offset_type" in kwargs and kwargs["offset_type"] != offset_type: om = kwargs.get("offset_mapper") if om is None: raise Exception( "Need to convert offsets, but no offset_mapper parameter given" ) offset_type = kwargs["offset_type"] if offset_type == OFFSET_TYPE_JAVA: changes = self._fixup_changes(om.convert_to_java, replace=False) else: changes = self._fixup_changes(om.convert_to_python, replace=False) return {"changes": changes, "offset_type": offset_type}
class Document (text: str = None, features=None, changelog: ChangeLog = None)
-
Represent a GATE document. This is different from the original Java GATE representation in several ways:
-
the text is not mutable and can only be set at creation time, so there is no "edit" method
-
as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is no separate "FeatureMap" to store them
-
does not support listener callbacks
- there is no separate abstraction for "content", the only content possible is text which is a unicode string that can be accessed with the "text()" method
- Spans of text can be directly accessed using doc[from:to]
- Features may only have string keys and values which can be json-serialised
- Annotation offsets by default are numbers of Unicode code points, this is different from Java where the offsets are UTF-16 Unicode code units
- Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index of UTF-16 code unit and back)
- No part of the document has to be present, not even the text (this allows saving just the annotations separately from the text)
- Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)
Args
text
- the text of the document. The text can be None to indicate that no initial text should be set. Once
the text has been set for a document, it is immutable and cannot be changed. features: the initial document features to set, a sequence of key/value tuples changelog: a ChangeLog instance to use to log changes.
Returns:
Expand source code
class Document:
    """Represent a GATE document.

    This is different from the original Java GATE representation in several ways:

    * the text is not mutable and can only be set at creation time, so there is no "edit" method
    * as a feature bearer, all the methods to set, get and manipulate features are part of this
      class, there is no separate "FeatureMap" to store them
    * does not support listener callbacks
    * there is no separate abstraction for "content", the only content possible is text which
      is a unicode string that can be accessed with the "text" property
    * Spans of text can be directly accessed using doc[from:to]
    * Features may only have string keys and values which can be json-serialised
    * Annotation offsets by default are number of Unicode code points, this is different from
      Java where the offsets are UTF-16 Unicode code units
    * Offsets of all annotations can be changed from/to Java (from python index of unicode
      codepoint to Java index of UTF-16 code unit and back)
    * No part of the document has to be present, not even the text (this allows saving just
      the annotations separately from the text)
    * Once the text has been set, it is immutable (no support to edit text and change
      annotation offsets accordingly)

    Args:
        text: the text of the document. The text can be None to indicate that no initial text
            should be set. Once the text has been set for a document, it is immutable and
            cannot be changed.
        features: the initial document features to set, a sequence of key/value tuples
        changelog: a ChangeLog instance to use to log changes.
    """

    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None):
        if text is not None:
            assert isinstance(text, str)
        if changelog is not None:
            assert isinstance(changelog, ChangeLog)
        self._changelog = changelog
        self._features = Features(features, logger=self._log_feature_change)
        self._annotation_sets = dict()
        self._text = text
        self.offset_type = OFFSET_TYPE_PYTHON
        self._name = ""

    @property
    def name(self):
        """The name of the document, the empty string if none has been set."""
        return self._name

    @name.setter
    def name(self, val):
        """Set the document name and record the change if there is a changelog.

        Args:
            val: the new name, None is stored as the empty string

        Raises:
            Exception: if val is neither None nor a string
        """
        if val is None:
            val = ""
        if not isinstance(val, str):
            raise Exception("Name must be a string")
        self._name = val
        if self._changelog is not None:
            ch = {"command": "name:set"}
            ch["name"] = val
            self._changelog.append(ch)

    def _ensure_type_python(self) -> None:
        """Raise an Exception unless the document currently uses python offsets."""
        if self.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception(
                "Document cannot be used if it is not type PYTHON, use to_type(OFFSET_TYPE_PYTHON) first"
            )

    def _fixup_annotations(self, method: Callable) -> None:
        """Replace the start and end offset of every annotation in every set.

        Args:
            method: callable that maps an old offset to the new offset
        """
        for annset in self._annotation_sets.values():
            if annset._annotations is not None:
                for ann in annset._annotations.values():
                    ann._start = method(ann._start)
                    ann._end = method(ann._end)

    def to_offset_type(self, offsettype: str) -> OffsetMapper:
        """Convert all the offsets of all the annotations in this document to the
        required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets
        are already of that type, this does nothing.

        NOTE: if the document has a ChangeLog, it is NOT also converted!

        Args:
            offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

        Returns:
            the offset mapper if anything actually was converted, otherwise None

        Raises:
            Exception: if the requested or the current offset type is not one of the two
                known types
        """
        if offsettype == self.offset_type:
            return None
        if offsettype == OFFSET_TYPE_JAVA and self.offset_type == OFFSET_TYPE_PYTHON:
            # convert from currently python to java
            om = OffsetMapper(self._text)
            self._fixup_annotations(om.convert_to_java)
            self.offset_type = OFFSET_TYPE_JAVA
        elif offsettype == OFFSET_TYPE_PYTHON and self.offset_type == OFFSET_TYPE_JAVA:
            # convert from currently java to python
            om = OffsetMapper(self._text)
            self._fixup_annotations(om.convert_to_python)
            self.offset_type = OFFSET_TYPE_PYTHON
        else:
            raise Exception("Odd offset type")
        return om

    def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
        """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog
        instance, a sequence of change objects (dicts) as stored in a ChangeLog instance,
        or a single change object. The document is modified in-place.

        Args:
            changes: one or more changes
            handle_existing_anns: what to do if the change from the changelog tries to add
                an annotation with an annotation id that already exists in the target set.
                (Default value = ADDANN_ADD_WITH_NEW_ID)

        Raises:
            Exception: if a change has no "command" field or the command is unknown
        """
        if isinstance(changes, dict):
            changes = [changes]
        elif isinstance(changes, ChangeLog):
            changes = changes.changes
        for change in changes:
            cmd = change.get("command")
            fname = change.get("feature")
            fvalue = change.get("value")
            features = change.get("features")
            sname = change.get("set")
            annid = change.get("id")
            if cmd is None:
                raise Exception("Change without field 'command'")
            if cmd == ACTION_ADD_ANNSET:
                assert sname is not None
                self.annset(sname)
            elif cmd == ACTION_ADD_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                start = change.get("start")
                end = change.get("end")
                anntype = change.get("type")
                # a change may carry no features at all: treat that as "no features"
                new_features = features if features is not None else {}
                if ann is None:
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_IGNORE:
                    pass
                elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                    # keep the features of the change, only the id is newly assigned
                    anns.add(start, end, anntype, features=features)
                elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                    anns.remove(annid)
                    # pass id and features by keyword, exactly like the "not existing" case
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                    ann.features.update(new_features)
                elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                    ann.features.clear()
                    ann.features.update(new_features)
                elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                    fns = ann.feature_names()
                    for f in new_features.keys():
                        if f not in fns:
                            ann.features[f] = new_features[f]
            elif cmd == ACTION_CLEAR_ANNS:
                assert sname is not None
                anns = self.annset(sname)
                anns.clear()
            elif cmd == ACTION_CLEAR_ANN_FEATURES:
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                # a missing annotation is ignored, could happen with a detached annotation
                if ann is not None:
                    ann.features.clear()
            elif cmd == ACTION_CLEAR_DOC_FEATURES:
                self.features.clear()
            elif cmd == ACTION_SET_ANN_FEATURE:
                assert fname is not None
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                # a missing annotation is ignored like for the other annotation feature commands
                if ann is not None:
                    ann.features[fname] = fvalue
            elif cmd == ACTION_DEL_ANN_FEATURE:
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                # a missing annotation is ignored, could happen with a detached annotation
                if ann is not None and fname is not None:
                    ann.features.pop(fname, None)
            elif cmd == ACTION_DEL_DOC_FEATURE:
                assert fname is not None
                self.features.pop(fname, None)
            elif cmd == ACTION_DEL_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                anns.remove(annid)
            elif cmd == ACTION_SET_DOC_FEATURE:
                assert fname is not None
                self.features[fname] = fvalue
            else:
                raise Exception("Unknown ChangeLog action: ", cmd)

    @property
    def features(self):
        """Accesses the features as the Features instance created for this document.
        The instance is created with this document's change logger callback.

        Returns:
            the features of the document
        """
        return self._features

    @property
    def changelog(self):
        """Get the ChangeLog or None if no ChangeLog has been set.

        Returns:
            the changelog
        """
        return self._changelog

    @changelog.setter
    def changelog(self, chlog):
        """Make the document use the given changelog to record all changes
        from this moment on.

        Args:
            chlog: the new changelog to use or None to not use any
        """
        oldchlog = self._changelog
        self._changelog = chlog
        # NOTE: the value returned from a property setter is discarded on assignment
        return oldchlog

    @property
    def text(self) -> str:
        """Get the text of the document. For a partial document, the text may be None.

        Returns:
            the text of the document

        Raises:
            Exception: if the document does not currently use python offsets
        """
        self._ensure_type_python()
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        """Set the text of the document. This is only possible as long as it has not been
        set yet, after that, the text is immutable.

        Args:
            value: the text for the document

        Raises:
            NotImplementedError: if the document already has a text
        """
        if self._text is None:
            self._text = value
        else:
            raise NotImplementedError("Text cannot be modified")

    def _log_feature_change(
        self, command: str, feature: str = None, value=None
    ) -> None:
        """Record a change of the document features in the changelog, if there is one.

        Args:
            command: the feature command, it gets recorded with the prefix "doc-"
            feature: the feature name, only recorded for "feature:set" (Default value = None)
            value: the feature value, only recorded for "feature:set" (Default value = None)
        """
        if self._changelog is None:
            return
        command = "doc-" + command
        ch = {"command": command}
        if command == "doc-feature:set":
            ch["feature"] = feature
            ch["value"] = value
        self._changelog.append(ch)

    def __len__(self) -> int:
        """
        Return the length of the text, 0 if the document has no text.

        Raises:
            Exception: if the document does not currently use python offsets
        """
        self._ensure_type_python()
        if self._text is None:
            return 0
        else:
            return len(self._text)

    def __getitem__(self, span) -> str:
        """
        Get the text for the given span.

        Args:
            span: a single number, an offset range of the form from:to, an annotation,
                an annotation set or any object with the attributes `start` and `end`.

        Returns:
            the text of the span
        """
        self._ensure_type_python()
        if isinstance(span, Annotation):
            return self.text[span._start : span._end]
        if isinstance(span, AnnotationSet):
            return self.text[span.start() : span.end()]
        if hasattr(span, "start") and hasattr(span, "end"):
            # must be a slice: indexing a str with the tuple (start, end) raises TypeError
            return self.text[span.start : span.end]
        return self.text[span]

    def annset(self, name: str = "") -> AnnotationSet:
        """Get the named annotation set, if name is not given or the empty string,
        the default annotation set. If the annotation set does not already exist,
        it is created.

        Args:
            name: the annotation set name, the empty string is used for the
                "default annotation set". (Default value = "")

        Returns:
            the specified annotation set.
        """
        self._ensure_type_python()
        if name not in self._annotation_sets:
            annset = AnnotationSet(owner_doc=self, name=name)
            self._annotation_sets[name] = annset
            # compare with None so that a changelog which is still empty (and may
            # therefore be falsy) records the creation as well
            if self._changelog is not None:
                self._changelog.append({"command": "annotations:add", "set": name})
            return annset
        else:
            return self._annotation_sets[name]

    def annset_names(self) -> list:
        """Get the names of all annotation sets of the document.

        Returns:
            a new list with the annotation set names
        """
        self._ensure_type_python()
        return list(self._annotation_sets.keys())

    def remove_annset(self, name: str):
        """Completely remove the annotation set.

        Args:
            name: name of the annotation set to remove

        Raises:
            Exception: if there is no annotation set with that name
        """
        if name not in self._annotation_sets:
            raise Exception(f"AnnotationSet with name {name} does not exist")
        del self._annotation_sets[name]
        # compare with None so that a still empty changelog records the removal as well
        if self._changelog is not None:
            self._changelog.append({"command": "annotations:remove", "set": name})

    def __repr__(self) -> str:
        """
        String representation of the document, showing all content.
        """
        return "Document({},features={},anns={})".format(
            self.text, self._features, self._annotation_sets.__repr__()
        )

    def __str__(self) -> str:
        """
        String representation showing the number of annotations per set only.
        """
        asets = (
            "["
            + ",".join([f"'{k}':{len(v)}" for k, v in self._annotation_sets.items()])
            + "]"
        )
        return "Document({},features={},anns={})".format(
            self.text, self._features, asets
        )

    def to_dict(self, offset_type=None, annsets=None, **kwargs):
        """Convert this instance to a dictionary that can be used to re-create the instance
        with from_dict.

        NOTE: if there is an active changelog, it is not included in the output as this
        field is considered a transient field!

        Args:
            offset_type: convert to the given offset type on the fly (Default value = None)
            annsets: if not None, a list of annotation set/type specifications: each element
                is either a string, the name of the annotation set to include, or a tuple
                where the first element is the annotation set name and the second element is
                either a type name or a list of type names. The same annotation set name
                should not be used in more than one specification.
            **kwargs: passed on to the to_dict method of the annotation sets

        Returns:
            the dictionary representation of this instance
        """
        # if the specified offset type is equal to what we have, do nothing, otherwise
        # create an offset mapper and pass it down to where we actually convert the annotations
        om = None
        if offset_type is not None:
            assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON
            if offset_type != self.offset_type:
                if self._text is not None:
                    om = OffsetMapper(self._text)
                    kwargs["offset_mapper"] = om
                    kwargs["offset_type"] = offset_type
        else:
            offset_type = self.offset_type

        # create the annotation sets map
        if annsets is not None:
            annsets_dict = {}
            for spec in annsets:
                if isinstance(spec, str):
                    annsets_dict[spec] = self._annotation_sets[spec].to_dict(**kwargs)
                else:
                    setname, types = spec
                    if isinstance(types, str):
                        types = [types]
                    annsets_dict[setname] = self._annotation_sets[setname].to_dict(
                        anntypes=types, **kwargs
                    )
        else:
            annsets_dict = {
                name: aset.to_dict(**kwargs)
                for name, aset in self._annotation_sets.items()
            }

        return {
            "annotation_sets": annsets_dict,
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": offset_type,
            "name": self.name,
        }

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Return a Document instance as represented by the dictionary dictrepr.

        Args:
            dictrepr: the dictionary representation of a document
            **kwargs: not used by this method

        Returns:
            the initialized Document instance

        Raises:
            Exception: if the "offset_type" entry is not one of the two known offset types
        """
        feats = dictrepr.get("features")
        doc = Document(dictrepr.get("text"), features=feats)
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type not in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON):
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        # a representation without annotation sets yields a document without any
        annsets_repr = dictrepr.get("annotation_sets") or {}
        doc._annotation_sets = {
            name: AnnotationSet.from_dict(adict, owner_doc=doc)
            for name, adict in annsets_repr.items()
        }
        return doc

    def save(
        self,
        destination,
        fmt=None,
        offset_type=None,
        mod="gatenlp.serialization.default",
        annsets=None,
        **kwargs,
    ):
        """Save the document to the destination file.

        Args:
            destination: either a file name or something that has a write(string) method.
            fmt: serialization format, by default the format is inferred from the file
                extension. Anything that is neither None nor a string is called directly
                as the saver.
            offset_type: store using the given offset type or keep the current if None
                (Default value = None)
            mod: module where the document saver is implemented.
                (Default value = "gatenlp.serialization.default")
            annsets: if not None, a list of annotation set names or tuples of set name and
                a list of annotation types to include in the serialized document.
            **kwargs: additional parameters for the document saver.
        """
        if annsets is not None:
            kwargs["annsets"] = annsets
        if fmt is None or isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(destination, fmt)
            saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
        else:
            # assume fmt is a callable to get used directly
            fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)

    def save_mem(
        self,
        fmt="json",
        offset_type=None,
        mod="gatenlp.serialization.default",
        **kwargs,
    ):
        """Serialize to a string or bytes in the given format.

        Args:
            fmt: serialization format to use, or a callable that gets used directly as
                the saver. (Default value = "json")
            offset_type: store using the given offset type or keep the current if None
                (Default value = None)
            mod: module where the document saver is implemented.
                (Default value = "gatenlp.serialization.default")
            **kwargs: additional parameters for the format.

        Returns:
            whatever the saver returns for `to_mem=True`

        Raises:
            Exception: if fmt is empty or None
        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(None, fmt)
        else:
            # assume fmt is a callable to get used directly
            saver = fmt
        # the result must be returned for a callable saver as well
        return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)

    @staticmethod
    def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
        """Load or import a document from the given source. The source can be a file path
        or file name or a URL. If the type of the source is str, then if it starts with
        "http[s]://" it will get treated as a URL. In order to deliberately use a file
        instead of a URL, create a pathlib Path, in order to deliberately use URL instead
        of a file parse the URL using urllib.

        Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

        Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

        NOTE: the offset type of the document is always converted to PYTHON when loading!

        Args:
            source: the URL or file path to load from.
            fmt: the format of the source. By default the format is inferred by the file
                extension. The format can be a format mnemonic like "json", "html", or a
                known mime type like "text/bdocjs".
            mod: the name of a module where the document loader is implemented.
                (Default value = "gatenlp.serialization.default")
            **kwargs: additional format specific keyword arguments to pass to the loader

        Returns:
            the loaded document
        """
        if fmt is None or isinstance(fmt, str):
            m = importlib.import_module(mod)
            loader = m.get_document_loader(source, fmt)
            doc = loader(Document, from_ext=source, **kwargs)
        else:
            doc = fmt(Document, from_ext=source, **kwargs)
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    @staticmethod
    def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
        """Create a document from the in-memory serialization in source. Source can be a
        string or bytes, depending on the format.

        Note: the offset type is always converted to PYTHON when loading!

        Args:
            source: the string/bytes to deserialize
            fmt: the format (Default value = "json")
            mod: the name of the module where the loader is implemented
                (Default value = "gatenlp.serialization.default")
            **kwargs: additional arguments to pass to the loader

        Returns:
            the loaded document

        Raises:
            Exception: if fmt is empty or None
        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            loader = m.get_document_loader(None, fmt)
            doc = loader(Document, from_mem=source, **kwargs)
        else:
            doc = fmt(Document, from_mem=source, **kwargs)
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    def __copy__(self):
        """
        Creates a shallow copy except the changelog which is set to None.

        Returns:
            shallow copy of the document
        """
        doc = Document(self._text)
        doc._annotation_sets = self._annotation_sets
        doc.offset_type = self.offset_type
        doc._features = self._features.copy()
        # the name is part of the document state (see to_dict), so the copy keeps it
        doc._name = self._name
        return doc

    def copy(self):
        """Creates a shallow copy except the changelog which is set to None.

        Returns:
            shallow copy of the document
        """
        return self.__copy__()

    def __deepcopy__(self, memo):
        """
        Creates a deep copy, except the changelog which is set to None.

        Args:
            memo: the memoization dictionary to use.

        Returns:
            a deep copy of the document.
        """
        if self._features is not None:
            fts = lib_copy.deepcopy(self._features.to_dict(), memo)
        else:
            fts = None
        doc = Document(self._text, features=fts)
        doc._changelog = None
        doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        doc.offset_type = self.offset_type
        # the name is part of the document state (see to_dict), so the copy keeps it
        doc._name = self._name
        return doc

    def deepcopy(self, memo=None):
        """Creates a deep copy, except the changelog which is set to None.

        Args:
            memo: the memoization dictionary to use.

        Returns:
            a deep copy of the document.
        """
        return lib_copy.deepcopy(self, memo=memo)

    def _repr_html_(self):
        """
        Render function for Jupyter notebooks. Returns the html-ann-viewer HTML.
        This renders the HTML for notebook, for offline mode, but does not add the JS
        but instead initializes the JS in the notebook unless gatenlp.init_notebook()
        has been called already.
        """
        return self._notebook_show()

    # TODO: maybe allow manual selection of how to show the document, e.g. also by
    # writing to a tmp file and browsing in a browser, or pprint etc.
    def show(self, htmlid=None, annsets=None):
        """
        Show the document in a Jupyter notebook. This allows to assign a specific htmlid so
        the generated HTML can be directly styled afterwards.
        This directly sends the rendered document to the cell (no display/HTML necessary).

        Args:
            htmlid: the HTML id prefix to use for classes and element ids.
            annsets: if not None, a list of annotation set/type specifications.
                Each element is either the name of a set to fully include, or a tuple with
                the name of the set as the first element and with a single type name or a
                list of type names as the second element

        Returns:
            None in a notebook, otherwise the string representation of the document
        """
        if in_notebook():
            self._notebook_show(htmlid=htmlid, display=True, annsets=annsets)
        else:
            return self.__str__()

    def _notebook_show(self, htmlid=None, display=False, annsets=None):
        """Render the document with the "html-ann-viewer" format for a notebook.

        Args:
            htmlid: the HTML id prefix passed on to the serializer
            display: if True, send the HTML to the notebook cell, otherwise return it
            annsets: annotation set/type specifications passed on to the serializer

        Returns:
            the HTML if display is False, otherwise None
        """
        from gatenlp.gatenlpconfig import gatenlpconfig
        from gatenlp.serialization.default import HtmlAnnViewerSerializer
        from IPython.display import display_html

        # the viewer javascript has to be initialized once per notebook
        if not gatenlpconfig.notebook_js_initialized:
            HtmlAnnViewerSerializer.init_javscript()
            gatenlpconfig.notebook_js_initialized = True
        html = self.save_mem(
            fmt="html-ann-viewer",
            notebook=True,
            add_js=False,
            offline=True,
            htmlid=htmlid,
            annsets=annsets,
        )
        if display:
            display_html(html, raw=True)
        else:
            return html
Subclasses
Static methods
def from_dict(dictrepr, **kwargs)
-
Return a Document instance as represented by the dictionary dictrepr.
Args
dictrepr
- return: the initialized Document instance
**kwargs:
Returns
the initialized Document instance
Expand source code
@staticmethod
def from_dict(dictrepr, **kwargs):
    """Return a Document instance as represented by the dictionary dictrepr.

    Args:
        dictrepr: the dictionary representation of a document
        **kwargs: not used by this method

    Returns:
        the initialized Document instance

    Raises:
        Exception: if the "offset_type" entry is not one of the two known offset types
    """
    doc = Document(dictrepr.get("text"), features=dictrepr.get("features"))
    doc.name = dictrepr.get("name")
    doc.offset_type = dictrepr.get("offset_type")
    if doc.offset_type not in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON):
        raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
    # a representation without annotation sets yields a document without any,
    # instead of failing with an AttributeError on None
    annsets_repr = dictrepr.get("annotation_sets") or {}
    doc._annotation_sets = {
        name: AnnotationSet.from_dict(adict, owner_doc=doc)
        for name, adict in annsets_repr.items()
    }
    return doc
def load(source, fmt=None, mod='gatenlp.serialization.default', **kwargs)
-
Load or import a document from the given source. The source can be a file path or file name or a URL. If the type of the source is str, then if it starts with "http[s]://" it will get treated as a URL. In order to deliberately use a file instead of a URL, create a pathlib Path, in order to deliberately use URL instead of a file parse the URL using urllib.
Example:
Document.load(urllib.parse.urlparse(someurl), fmt=theformat)
Example:
Document.load(pathlib.Path(somepath), fmt=theformat)
NOTE: the offset type of the document is always converted to PYTHON when loading!
Args
source
- the URL or file path to load from.
fmt
- the format of the source. By default the format is inferred by the file extension.
The format can be a format mnemonic like "json", "html", or a known mime type like "text/bdocjs". mod: the name of a module where the document loader is implemented. (Default value = "gatenlp.serialization.default") kwargs: additional format specific keyword arguments to pass to the loader **kwargs:
Returns
the loaded document
Expand source code
@staticmethod
def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
    """Load or import a document from the given source.

    The source can be a file path or file name or a URL. If the type of the source is
    str, then if it starts with "http[s]://" it will get treated as a URL. In order to
    deliberately use a file instead of a URL, create a pathlib Path, in order to
    deliberately use URL instead of a file parse the URL using urllib.

    Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

    Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

    NOTE: the offset type of the document is always converted to PYTHON when loading!

    Args:
        source: the URL or file path to load from.
        fmt: the format of the source. By default the format is inferred by the file
            extension. The format can be a format mnemonic like "json", "html", or a
            known mime type like "text/bdocjs". Anything that is neither None nor a
            string is called directly as the loader.
        mod: the name of a module where the document loader is implemented.
            (Default value = "gatenlp.serialization.default")
        **kwargs: additional format specific keyword arguments to pass to the loader

    Returns:
        the loaded document
    """
    if fmt is not None and not isinstance(fmt, str):
        # not a format name: use the given object directly as the loader
        loader = fmt
    else:
        loader = importlib.import_module(mod).get_document_loader(source, fmt)
    loaded = loader(Document, from_ext=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded
def load_mem(source, fmt='json', mod='gatenlp.serialization.default', **kwargs)
-
Create a document from the in-memory serialization in source. Source can be a string or bytes, depending on the format.
Note: the offset type is always converted to PYTHON when loading!
Args
source
- the string/bytes to deserialize
fmt
- the format (Default value = "json")
mod
- the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
kwargs
- additional arguments to pass to the loader
**kwargs: Returns:
Expand source code
@staticmethod
def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
    """Create a document from the in-memory serialization in source.

    Source can be a string or bytes, depending on the format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
        source: the string/bytes to deserialize
        fmt: the format, or an object that is called directly as the loader
            (Default value = "json")
        mod: the name of the module where the loader is implemented
            (Default value = "gatenlp.serialization.default")
        **kwargs: additional arguments to pass to the loader

    Returns:
        the loaded document

    Raises:
        Exception: if fmt is empty or None
    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(None, fmt)
    else:
        # not a format name: use the given object directly as the loader
        loader = fmt
    loaded = loader(Document, from_mem=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded
Instance variables
property/get/set changelog
-
Get the ChangeLog or None if no ChangeLog has been set.
:return: the changelog
Args:
Returns:
Expand source code
@property
def changelog(self):
    """The ChangeLog currently used by the document.

    Returns:
        the changelog, or None if no ChangeLog has been set
    """
    current = self._changelog
    return current
property/get features
-
Accesses the features as a FeatureViewer instance. Changes made on this object are reflected in the document and recorded in the change log, if there is one.
:return: A FeatureViewer view of the document features.
Args:
Returns:
Expand source code
@property
def features(self):
    """The features of the document.

    This is the Features instance created in the constructor with the document's
    feature change logger, not a copy.

    Returns:
        the features of the document
    """
    feats = self._features
    return feats
property/get/set name
-
Expand source code
@property
def name(self):
    """The name of the document.

    Returns:
        the stored document name
    """
    current = self._name
    return current
property/get/set text : str
-
Get the text of the document. For a partial document, the text may be None.
:return: the text of the document
Args:
Returns:
Expand source code
@property
def text(self) -> str:
    """The text of the document.

    The offset type check `_ensure_type_python` is run before the text is returned.

    Returns:
        the text of the document, for a partial document this may be None
    """
    self._ensure_type_python()
    content = self._text
    return content
Methods
def annset(self, name: str = '') ‑> AnnotationSet
-
Get the named annotation set, if name is not given or the empty string, the default annotation set. If the annotation set does not already exist, it is created.
Args
name
- the annotation set name, the empty string is used for the "default annotation set".
name
- str: (Default value = "")
Returns
the specified annotation set.
Expand source code
def annset(self, name: str = "") -> AnnotationSet:
    """Get the named annotation set, if name is not given or the empty string,
    the default annotation set. If the annotation set does not already exist,
    it is created and the creation is recorded in the changelog, if there is one.

    Args:
        name: the annotation set name, the empty string is used for the
            "default annotation set". (Default value = "")

    Returns:
        the specified annotation set.
    """
    self._ensure_type_python()
    existing = self._annotation_sets.get(name)
    if existing is not None or name in self._annotation_sets:
        return self._annotation_sets[name]
    annset = AnnotationSet(owner_doc=self, name=name)
    self._annotation_sets[name] = annset
    # compare with None so that a changelog which is still empty (and may
    # therefore be falsy) records the creation as well
    if self._changelog is not None:
        self._changelog.append({"command": "annotations:add", "set": name})
    return annset
def annset_names(self) ‑> KeysView[str]
-
Args:
Returns
:return: annotation set names
Expand source code
def annset_names(self) -> list:
    """Get the names of all annotation sets of the document.

    The offset type check `_ensure_type_python` is run first.

    Returns:
        a new list with the annotation set names (not a view of the internal dict)
    """
    self._ensure_type_python()
    return list(self._annotation_sets.keys())
def apply_changes(self, changes, handle_existing_anns='add-with-new-id')
-
Apply changes from a ChangeLog to this document.
changes
can be a ChangeLog instance, a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object. The document is modified in-place.
Args
changes
- one or more changes
handle_existing_anns
- what to do if the change from the changelog tries to add an annotation
with an annotation id that already exists in the target set. (Default value = ADDANN_ADD_WITH_NEW_ID)
Returns:
Expand source code
def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
    """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog
    instance, a sequence of change objects (dicts) as stored in a ChangeLog instance,
    or a single change object. The document is modified in-place.

    Args:
        changes: one or more changes
        handle_existing_anns: what to do if the change from the changelog tries to add
            an annotation with an annotation id that already exists in the target set.
            (Default value = ADDANN_ADD_WITH_NEW_ID)

    Raises:
        Exception: if a change has no "command" field or the command is unknown
    """
    if isinstance(changes, dict):
        changes = [changes]
    elif isinstance(changes, ChangeLog):
        changes = changes.changes
    for change in changes:
        cmd = change.get("command")
        fname = change.get("feature")
        fvalue = change.get("value")
        features = change.get("features")
        sname = change.get("set")
        annid = change.get("id")
        if cmd is None:
            raise Exception("Change without field 'command'")
        if cmd == ACTION_ADD_ANNSET:
            assert sname is not None
            self.annset(sname)
        elif cmd == ACTION_ADD_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            start = change.get("start")
            end = change.get("end")
            anntype = change.get("type")
            # a change may carry no features at all: treat that as "no features"
            new_features = features if features is not None else {}
            if ann is None:
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_IGNORE:
                pass
            elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                # keep the features of the change, only the id is newly assigned
                anns.add(start, end, anntype, features=features)
            elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                anns.remove(annid)
                # pass id and features by keyword, exactly like the "not existing" case
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                ann.features.update(new_features)
            elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                ann.features.clear()
                ann.features.update(new_features)
            elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                fns = ann.feature_names()
                for f in new_features.keys():
                    if f not in fns:
                        ann.features[f] = new_features[f]
        elif cmd == ACTION_CLEAR_ANNS:
            assert sname is not None
            anns = self.annset(sname)
            anns.clear()
        elif cmd == ACTION_CLEAR_ANN_FEATURES:
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None:
                ann.features.clear()
        elif cmd == ACTION_CLEAR_DOC_FEATURES:
            self.features.clear()
        elif cmd == ACTION_SET_ANN_FEATURE:
            assert fname is not None
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored like for the other annotation feature commands
            if ann is not None:
                ann.features[fname] = fvalue
        elif cmd == ACTION_DEL_ANN_FEATURE:
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None and fname is not None:
                ann.features.pop(fname, None)
        elif cmd == ACTION_DEL_DOC_FEATURE:
            assert fname is not None
            self.features.pop(fname, None)
        elif cmd == ACTION_DEL_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            anns.remove(annid)
        elif cmd == ACTION_SET_DOC_FEATURE:
            assert fname is not None
            self.features[fname] = fvalue
        else:
            raise Exception("Unknown ChangeLog action: ", cmd)
def copy(self)
-
Creates a shallow copy except the changelog which is set to None.
Returns
a shallow copy of the document
Expand source code
def copy(self):
    """Return a shallow copy of this document.

    The copy is produced by delegating to `__copy__`. The changelog is
    not carried over to the copy, it is set to None there.

    Returns:
        a shallow copy of the document
    """
    shallow = self.__copy__()
    return shallow
def deepcopy(self, memo=None)
-
Creates a deep copy, except the changelog which is set to None.
Args
memo
- the memoization dictionary to use.
Returns
a deep copy of the document.
Expand source code
def deepcopy(self, memo=None):
    """Return a deep copy of this document.

    The changelog is not carried over to the copy, it is set to None there.

    Args:
        memo: the memoization dictionary to hand to the copy machinery,
            or None to let it create its own.

    Returns:
        a deep copy of the document.
    """
    duplicate = lib_copy.deepcopy(self, memo=memo)
    return duplicate
def remove_annset(self, name: str)
-
Completely remove the annotation set.
Args
name
- name of the annotation set to remove
name
- str:
Returns:
Expand source code
def remove_annset(self, name: str):
    """Completely remove the annotation set.

    If the document has a changelog, the removal is recorded there as an
    "annotations:remove" command for the set.

    Args:
        name: name of the annotation set to remove

    Raises:
        Exception: if no annotation set with the given name exists.
    """
    if name not in self._annotation_sets:
        raise Exception(f"AnnotationSet with name {name} does not exist")
    del self._annotation_sets[name]
    # Compare against None instead of testing truthiness: a changelog that
    # has not recorded anything yet may evaluate as false (empty container)
    # and the removal would then silently go unrecorded.
    if self._changelog is not None:
        self._changelog.append({"command": "annotations:remove", "set": name})
def save(self, destination, fmt=None, offset_type=None, mod='gatenlp.serialization.default', annsets=None, **kwargs)
-
Save the document to the destination file.
Args
destination
- either a file name or something that has a write(string) method.
fmt
- serialization format, by default the format is inferred from the file extension.
offset_type
- store using the given offset type or keep the current if None (Default value = None)
mod
- module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
annsets
- if not None, a list of annotation set names or tuples of set name and a list of annotation types to include in the serialized document.
kwargs
- additional parameters for the document saver.
**kwargs:
Expand source code
def save(
    self,
    destination,
    fmt=None,
    offset_type=None,
    mod="gatenlp.serialization.default",
    annsets=None,
    **kwargs,
):
    """Save the document to the destination file.

    Args:
        destination: either a file name or something that has a
            write(string) method.
        fmt: serialization format, by default the format is inferred from
            the file extension. Anything that is neither None nor a string
            is assumed to be a callable and is used directly as the saver.
        offset_type: store using the given offset type or keep the current
            if None (Default value = None)
        mod: module where the document saver is implemented.
            (Default value = "gatenlp.serialization.default")
        annsets: if not None, a list of annotation set names or tuples of
            set name and a list of annotation types to include in the
            serialized document.
        **kwargs: additional parameters for the document saver.
    """
    if annsets is not None:
        kwargs["annsets"] = annsets
    if fmt is not None and not isinstance(fmt, str):
        # assume fmt is a callable to get used directly
        write_document = fmt
    else:
        serializer_module = importlib.import_module(mod)
        write_document = serializer_module.get_document_saver(destination, fmt)
    write_document(
        Document, self, to_ext=destination, offset_type=offset_type, **kwargs
    )
def save_mem(self, fmt='json', offset_type=None, mod='gatenlp.serialization.default', **kwargs)
-
Serialize to a string or bytes in the given format.
Args
fmt
- serialization format to use. (Default value = "json")
offset_type
- store using the given offset type or keep the current if None (Default value = None)
mod
- module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
- additional parameters for the format.
**kwargs: Returns:
Expand source code
def save_mem(
    self,
    fmt="json",
    offset_type=None,
    mod="gatenlp.serialization.default",
    **kwargs,
):
    """Serialize to a string or bytes in the given format.

    Args:
        fmt: serialization format to use, either a format name or a
            callable that is used directly as the saver.
            (Default value = "json")
        offset_type: store using the given offset type or keep the current
            if None (Default value = None)
        mod: module where the document saver is implemented.
            (Default value = "gatenlp.serialization.default")
        **kwargs: additional parameters for the format.

    Returns:
        whatever the saver returns for in-memory serialization.

    Raises:
        Exception: if no format is given.
    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        m = importlib.import_module(mod)
        saver = m.get_document_saver(None, fmt)
    else:
        # assume fmt is a callable to get used directly
        saver = fmt
    # Both branches must hand back the serialized data: without the return
    # a callable fmt would produce the data only to throw it away.
    return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
def show(self, htmlid=None, annsets=None)
-
Show the document in a Jupyter notebook. This allows assigning a specific htmlid so that the generated HTML can be styled directly afterwards. The rendered document is sent directly to the cell (no display/HTML necessary).
Args
htmlid
- the HTML id prefix to use for classes and element ids.
annsets
- if not None, a list of annotation set/type specifications. Each element is either the name of a set to fully include, or a tuple with the name of the set as the first element and with a single type name or a list of type names as the second element
Expand source code
def show(self, htmlid=None, annsets=None):
    """Show the document in a Jupyter notebook.

    This allows assigning a specific htmlid so that the generated HTML can
    be styled directly afterwards. Inside a notebook the rendered document
    is sent directly to the cell (no display/HTML necessary) and nothing is
    returned; outside a notebook the string representation of the document
    is returned instead.

    Args:
        htmlid: the HTML id prefix to use for classes and element ids.
        annsets: if not None, a list of annotation set/type specifications.
            Each element is either the name of a set to fully include, or
            a tuple with the name of the set as the first element and with
            a single type name or a list of type names as the second
            element
    """
    if not in_notebook():
        return self.__str__()
    self._notebook_show(htmlid=htmlid, display=True, annsets=annsets)
def to_dict(self, offset_type=None, annsets=None, **kwargs)
-
Convert this instance to a dictionary that can be used to re-create the instance with from_dict. NOTE: if there is an active changelog, it is not included in the output as this field is considered a transient field!
Args
offset_type
- convert to the given offset type on the fly (Default value = None)
annsets
- if not None, a list of annotation set/type specifications: each element is either a string, the name of the annotation set to include, or a tuple where the first element is the annotation set name and the second element is either a type name or a list of type names. The same annotation set name should not be used in more than one specification.
**kwargs:
Returns
the dictionary representation of this instance
Expand source code
def to_dict(self, offset_type=None, annsets=None, **kwargs):
    """Convert this instance to a dictionary that can be used to re-create
    the instance with from_dict.

    NOTE: if there is an active changelog, it is not included in the output
    as this field is considered a transient field!

    Args:
        offset_type: convert to the given offset type on the fly
            (Default value = None)
        annsets: if not None, a list of annotation set/type specifications:
            each element is either a string, the name of the annotation set
            to include, or a tuple where the first element is the annotation
            set name and the second element is either a type name or a list
            of type names. The same annotation set name should not be used
            in more than one specification.
        **kwargs: passed on to the to_dict method of each annotation set.

    Returns:
        the dictionary representation of this instance
    """
    # Only when a different offset type is requested (and there is text to
    # base the mapping on) an offset mapper is created and passed down to
    # where the annotations actually get converted.
    if offset_type is None:
        offset_type = self.offset_type
    else:
        assert offset_type in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON)
        if offset_type != self.offset_type and self._text is not None:
            kwargs["offset_mapper"] = OffsetMapper(self._text)
            kwargs["offset_type"] = offset_type

    # create the annotation sets map
    if annsets is None:
        serialized_sets = {
            set_name: annset.to_dict(**kwargs)
            for set_name, annset in self._annotation_sets.items()
        }
    else:
        serialized_sets = {}
        for spec in annsets:
            if isinstance(spec, str):
                # plain set name: include the whole set
                serialized_sets[spec] = self._annotation_sets[spec].to_dict(**kwargs)
                continue
            set_name, type_names = spec
            if isinstance(type_names, str):
                type_names = [type_names]
            serialized_sets[set_name] = self._annotation_sets[set_name].to_dict(
                anntypes=type_names, **kwargs
            )

    return {
        "annotation_sets": serialized_sets,
        "text": self._text,
        "features": self._features.to_dict(),
        "offset_type": offset_type,
        "name": self.name,
    }
def to_offset_type(self, offsettype: str) ‑> OffsetMapper
-
Convert all the offsets of all the annotations in this document to the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets are already of that type, this does nothing.
NOTE: if the document has a ChangeLog, it is NOT also converted!
The method returns the offset mapper if anything actually was converted, otherwise None.
Args
offsettype
- either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON
offsettype
- str:
Returns
offset mapper or None
Expand source code
def to_offset_type(self, offsettype: str) -> OffsetMapper:
    """Convert all the offsets of all the annotations in this document to
    the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON.

    If the offsets are already of that type, this does nothing.

    NOTE: if the document has a ChangeLog, it is NOT also converted!

    Args:
        offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

    Returns:
        the offset mapper if anything actually was converted, otherwise None

    Raises:
        Exception: if the requested and the current offset type are not a
            Java/Python pair.
    """
    if offsettype == self.offset_type:
        # already the requested type, nothing to convert
        return None

    going_to_java = (
        offsettype == OFFSET_TYPE_JAVA and self.offset_type == OFFSET_TYPE_PYTHON
    )
    going_to_python = (
        not going_to_java
        and offsettype == OFFSET_TYPE_PYTHON
        and self.offset_type == OFFSET_TYPE_JAVA
    )
    if not going_to_java and not going_to_python:
        raise Exception("Odd offset type")

    mapper = OffsetMapper(self._text)
    if going_to_java:
        # convert from currently python to java
        self._fixup_annotations(mapper.convert_to_java)
        self.offset_type = OFFSET_TYPE_JAVA
    else:
        # convert from currently java to python
        self._fixup_annotations(mapper.convert_to_python)
        self.offset_type = OFFSET_TYPE_PYTHON
    return mapper
-