Source code for gatenlp.impl.sortedintvls

"""
A simple class that represents a collection of sorted intervals and allows for some basic interval-based
operations. Internally this stores the intervals using standard sorted lists. This is not optimal and my
incur a O(n) overhead on some operations depending on the result set. It also may incur a significant overhead
for creating and maintaning the sorted lists.
NOTE: this stores a tuple (start, end, object) in the sorted list and uses a key function that returns the offset
for sorting.
"""

from sortedcontainers import SortedKeyList


[docs]class SortedIntvls: def __init__(self): # NOTE: we sort by increasing start offset and decreasing end offset for this self._by_start = SortedKeyList(key=lambda x: x[0]) # for this we only sort by the end offset self._by_end = SortedKeyList(key=lambda x: x[1])
[docs] def add(self, start, end, data): self._by_start.add((start, end, data)) self._by_end.add((start, end, data))
[docs] def update(self, tupleiterable): self._by_start.update(tupleiterable) self._by_end.update(tupleiterable)
[docs] def remove(self, start, end, data): self._by_start.remove((start, end, data)) self._by_end.remove((start, end, data))
[docs] def discard(self, start, end, data): self._by_start.discard((start, end, data)) self._by_end.discard((start, end, data))
def __len__(self): return len(self._by_start)
[docs] def starting_at(self, offset): """ Return an iterable of (start, end, data) tuples where start==offset :param offset: the starting offset :return: """ return self._by_start.irange_key(min_key=offset, max_key=offset)
[docs] def ending_at(self, offset): """ Return an iterable of (start, end, data) tuples where end==offset :param offset: the ending offset :return: """ return self._by_end.irange_key(min_key=offset, max_key=offset)
[docs] def at(self, start, end): """ Return iterable of tuples where start==start and end==end :param start: :param end: :return: """ for intvl in self._by_start.irange_key(min_key=start, max_key=start): if intvl[1] == end: yield intvl
# SAME as within
[docs] def within(self, start, end): """ Return intervals which are fully contained within start...end :param start: :param end: :return: """ # get all the intervals that start within the range, then keep those which also end within the range for intvl in self._by_start.irange_key(min_key=start, max_key=end): if intvl[1] <= end: yield intvl
[docs] def starting_from(self, offset): """ Intervals that start at or after offset. :param offset: :return: """ return self._by_start.irange_key(min_key=offset)
[docs] def starting_before(self, offset): """ Intervals that start before offset :param offset: :return: """ return self._by_start.irange_key(max_key=offset-1)
[docs] def ending_to(self, offset): """ Intervals that end before or at the given end offset. NOTE: the result is sorted by end offset, not start offset! :param offset: :return: """ return self._by_end.irange_key(max_key=offset)
[docs] def ending_after(self, offset): """ Intervals the end after the given offset NOTE: the result is sorted by end offset! :param offset: :return: """ return self._by_end.irange_key(min_key=offset+1)
# SAME as covering
[docs] def covering(self, start, end): """ Intervals that contain the given range :param start: :param end: :return: """ # All intervals that start at or before the start and end at or after the end offset # we do this by first getting the intervals the start before or atthe start # then filtering by end for intvl in self._by_start.irange_key(max_key=start): if intvl[1] >= end: yield intvl
[docs] def overlapping(self, start, end): """ Intervals that overlap with the given range. :param start: :param end: :return: """ # All intervals where the start offset is before the end and the end offset is after the start # plus all intervals where the start offset is after the start but before the end # and the end offset is after the end for intvl in self._by_start.irange_key(max_key=end-1): if intvl[1] > start+1: yield intvl
[docs] def firsts(self): """ Return an iterator of all intervals at the minimum start offset that exists. :return: """ laststart = None # logger.info("DEBUG: set laststart to None") for intvl in self._by_start.irange_key(): # logger.info("DEBUG: checking interval {}".format(intvl)) if laststart is None: laststart = intvl[0] # logger.info("DEBUG: setting laststart to {} and yielding {}".format(intvl[0], intvl)) yield intvl elif intvl[0] == laststart: # logger.info("DEBUG: yielding {}".format(intvl)) yield intvl else: # logger.info("DEBUG: returning since we got {}".format(intvl)) return
[docs] def lasts(self): """ Return an iterator of all intervals at the maximum start offset that exists. :return: """ laststart = None for intvl in reversed(self._by_start): if laststart is None: laststart = intvl[0] yield intvl elif intvl[0] == laststart: yield intvl else: return
[docs] def min_start(self): """ Returns the smallest start offset we have :return: """ return self._by_start[0][0]
[docs] def max_end(self): """ Returns the biggest end offset we have :return: """ return self._by_end[-1][1]
[docs] def irange(self, minoff=None, maxoff=None, reverse=False): return self._by_start.irange_key(min_key=minoff, max_key=maxoff, reverse=reverse)
def __repr__(self): return "SortedIntvls({},{})".format(self._by_start, self._by_end)