Source code for gatenlp.offsetmapper
import numbers
# Short tags naming the two offset conventions this module converts between.
# NOTE(review): presumably used by callers to record which kind of offsets
# they currently hold -- TODO confirm against the call sites.
OFFSET_TYPE_JAVA = "j"  # Java-style offsets: counted in UTF-16 code units
OFFSET_TYPE_PYTHON = "p"  # Python-style offsets: counted in Unicode code points
class OffsetMapper:
    """
    Convert offsets into one fixed text between Python and Java conventions.

    Python strings are indexed by Unicode code point, Java strings by UTF-16
    code unit. The two differ as soon as the text contains a character outside
    the Basic Multilingual Plane, which occupies two code units in Java but a
    single position in Python.

    Attributes set by the constructor:

    * ``python2java``: list with ``len(text) + 1`` entries mapping each python
      offset (including the end-of-text offset) to the java offset, or None if
      the offsets are identical.
    * ``java2python``: list with one entry per UTF-16 code unit plus one for
      the end-of-text offset, mapping each java offset to the python offset,
      or None if the offsets are identical.
    * ``bijective``: ``len(text)`` if python and java offsets are identical
      for this text (no tables are kept then), otherwise None.
    """

    def __init__(self, text: str):
        """
        Calculate the tables for mapping unicode code points to utf16 code units.

        NOTE: currently this optimizes for conversion speed at the cost of memory, with one special case:
        if after creating the java2python table we find that all offsets are identical, we discard
        the tables and just set a flag for that.

        :param text: the text as a python string
        """
        # Plain python lists of integers are used on purpose so that this module
        # does not depend on numpy.
        cur_java_off = 0
        # python2java_list[p] is the java offset where python position p starts;
        # the entry appended for the last character is the java end-of-text offset.
        python2java_list = [0]
        java2python_list = []
        for i, c in enumerate(text):
            # Code points above U+FFFF need a surrogate pair (two code units) in
            # UTF-16, everything else needs exactly one. Checking the code point
            # directly avoids encoding every character and also works for lone
            # surrogate code points, which cannot be encoded to UTF-16.
            width = 2 if ord(c) > 0xFFFF else 1
            cur_java_off += width
            # Always record the running java offset: using "previous + 1" for the
            # end-of-text entry would be off by one whenever the last character
            # is two code units wide.
            python2java_list.append(cur_java_off)
            # i is the current python offset; every code unit of this character
            # maps back to it.
            java2python_list.extend([i] * width)
        if len(java2python_list) == len(text):
            # Every character is one code unit wide: offsets are identical, so
            # drop the tables and only remember the text length.
            self.python2java = None
            self.java2python = None
            self.bijective = len(text)
        else:
            # The java end-of-text offset maps to the python end-of-text offset.
            java2python_list.append(len(text))
            self.python2java = python2java_list
            self.java2python = java2python_list
            self.bijective = None  # if we have identical offsets, this is set to the length of the text instead

    def _convert_from(self, offsets, from_table=None):
        """
        Look up one offset or an iterable of offsets in the given table.

        :param offsets: a single integer offset or an iterable of offsets
        :param from_table: the lookup table to index with the offsets; if None
            (offsets are identical in both conventions) the offsets are
            returned unchanged
        :return: a single int for a single offset, a list of ints for an
            iterable of offsets, or the unchanged argument if from_table is None
        """
        if from_table is None:
            return offsets
        if isinstance(offsets, numbers.Integral):
            return int(from_table[offsets])
        return [int(from_table[offset]) for offset in offsets]

    def convert_to_python(self, offsets):
        """
        Convert one java offset or an iterable of java offsets to python offset/s

        :param offsets: a single offset or an iterable of offsets
        :return: the python offset, or a list of python offsets; if offsets are
            identical for this text the argument is returned unchanged
        """
        return self._convert_from(offsets, from_table=self.java2python)

    def convert_to_java(self, offsets):
        """
        Convert one python offset or an iterable of python offsets to java offset/s

        :param offsets: a single offset or an iterable of offsets
        :return: the java offset, or a list of java offsets; if offsets are
            identical for this text the argument is returned unchanged
        """
        return self._convert_from(offsets, from_table=self.python2java)