Module gatenlp.processing.client
Module that provides various Annotators which act as clients to REST annotation services.
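All annotators in this module follow the same usage pattern: configure a client with the endpoint and credentials of the remote service, then call it on a document; the call returns the same document with the service results added as annotations. A minimal sketch of the pattern (client_annotator stands for any configured annotator instance from this module; see the per-class examples below):

from gatenlp import Document

doc = Document("Some text to annotate.")
doc = client_annotator(doc)  # any configured annotator from this module
anns = doc.annset("")        # annotations are created in the configured out_annset (default "")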
Expand source code
"""
Module that provides various Annotators which act as clients to REST annotation services.
"""
import logging
import json
from gatenlp.processing.annotator import Annotator
import requests
from requests.auth import HTTPBasicAuth
from gatenlp.utils import init_logger
import time
from gatenlp.offsetmapper import OffsetMapper
# TODO:
# * support compression send/receive
# * send GATE XML for existing annotations (requires GATE XML serialization writer)
# * send raw HTML or other formats supported by the endpoint instead of "doc" (which so far is just text)
# * maybe support the 100-continue protocol; so far we don't
# * ERROR HANDLING: raise exception vs return None?
class GateCloudAnnotator(Annotator):
"""
This annotator sends the text of a document to a GATE Cloud (https://cloud.gate.ac.uk/) endpoint and uses the
returned result to create annotations.
"""
def __init__(
self,
api_key=None,
api_password=None,
url=None,
ann_types=None,
map_types=None,
out_annset="",
min_delay_ms=501,
):
"""
Create a GateCloudAnnotator.
Args:
api_key: API key needed to authenticate. Some services can be used in a limited way without
authentication.
            api_password: API password needed to authenticate.
url: the URL of the annotation service endpoint, shown on the GATE Cloud page for the service
ann_types: this can be used to let the service annotate fewer or more than the default list of annotation
types. The default list and all possible annotations are shown on the GATE Cloud page for the service.
                Either a string with comma-separated annotation types preceded by a colon (e.g. ":Person,:Location")
or a python list with those type names (e.g. [":Person", ":Location"]). If the list contains type names
without a leading colon, the colon is added.
map_types: a dict which maps the annotation types from the service to arbitrary new annotation types,
any type name not in the map will remain unchanged.
out_annset: the annotation set in which to store the annotations
min_delay_ms: minimum time in milliseconds between two subsequent requests to the server
"""
self.api_key = api_key
self.api_password = api_password
self.url = url
self.map_types = map_types
self.min_delay_s = min_delay_ms / 1000.0
self.out_annset = out_annset
if ann_types:
if isinstance(ann_types, str):
self.ann_types = ann_types
elif isinstance(ann_types, list):
self.ann_types = ",".join(
[at if at.startswith(":") else ":" + at for at in ann_types]
)
else:
                raise Exception(
                    "ann_types must be a string of types like ':Person,:Location' or a list of types"
                )
else:
self.ann_types = None
self.logger = init_logger()
self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
def __call__(self, doc, **kwargs):
        delay = time.time() - self._last_call_time
        if delay < self.min_delay_s:
            time.sleep(self.min_delay_s - delay)
        self._last_call_time = time.time()  # record the request time so the next call can honor min_delay_ms
if "url" in kwargs:
url = kwargs["url"]
else:
url = self.url
text = doc.text
hdrs = {
"Content-Type": "text/plain; charset=UTF-8",
"Accept": "application/gate+json",
}
params = {}
if self.ann_types:
params["annotations"] = self.ann_types
# NOTE: not sure when this is needed, for now, disabled
# next_annid = doc.annset(self.out_annset)._next_annid
# params["nextAnnotationId"] = str(next_annid)
# self.logger.debug(f"Sending text={text}, params={params}")
if self.api_key:
response = requests.post(
url,
data=text.encode("utf-8"),
headers=hdrs,
params=params,
auth=HTTPBasicAuth(self.api_key, self.api_password),
)
else:
response = requests.post(
url, data=text.encode("utf-8"), headers=hdrs, params=params
)
scode = response.status_code
if scode != 200:
raise Exception(f"Something went wrong, received status code {scode}")
        response_json = response.json()
        ents = response_json.get("entities", {})
annset = doc.annset(self.out_annset)
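        # the GATE JSON response maps each annotation type to a list of annotation objects,
        # each carrying its features plus an "indices" entry holding [start, end]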
for typename, anns in ents.items():
for anndata in anns:
feats = {}
                start, end = None, None  # will cause an exception below if the returned data has no "indices"
for fname, fval in anndata.items():
if fname == "indices":
start, end = fval[0], fval[1]
else:
feats[fname] = fval
if self.map_types:
typename = self.map_types.get(typename, typename)
# self.logger.debug(f"Adding annotation {start},{start},{typename},{feats}")
annset.add(start, end, typename, features=feats)
return doc
class TagMeAnnotator(Annotator):
"""
An annotator that sends text to the TagMe Annotation service (https://sobigdata.d4science.org/group/tagme/tagme)
and uses the result to annotate the document.
"""
def __init__(
self,
lang="en",
ann_type="Mention",
auth_token=None,
url=None,
task="tag", # or spot
out_annset="",
min_delay_ms=501,
tweet=False,
include_all_spots=False,
long_text=None,
epsilon=None,
link_pattern="https://{0}.wikipedia.org/wiki/{1}",
):
"""
Create a TagMeAnnotator.
Args:
lang: the language of the text, one of 'de', 'en' (default), 'it'
ann_type: the annotation type for the new annotations, default is "Mention"
auth_token: the authentication token needed to use the service
            url: the annotation service endpoint; if None, the default endpoint for the task (spot or tag) is used
task: one of "spot" (only find mentions) or "tag" (find mentions and link), default is "tag"
out_annset: the annotationset to put the new annotations in
min_delay_ms: minimum time in ms to wait between requests to the server
tweet: if True, TagMe expects a Tweet (default is False)
include_all_spots: if True, include spots that cannot be linked (default is False)
long_text: if not None, the context length to use (default: None)
            epsilon: if not None, the epsilon value (float) to use (default: None)
link_pattern: the URL pattern to use to turn the "title" returned from TagMe into an actual link. The
default is "https://{0}.wikipedia.org/wiki/{1}" where {0} gets replaced with the language code and
{1} gets replaced with the title.
"""
if url is None:
if task == "tag":
url = "https://tagme.d4science.org/tagme/tag"
elif task == "spot":
url = "https://tagme.d4science.org/tagme/spot"
else:
raise Exception("task must be 'tag' or 'spot'")
assert lang in ["en", "de", "it"]
if long_text is not None:
assert isinstance(long_text, int)
if epsilon is not None:
assert isinstance(epsilon, float)
self.long_text = long_text
self.epsilon = epsilon
self.lang = lang
self.auth_token = auth_token
self.url = url
self.tweet = tweet
self.include_all_spots = include_all_spots
self.out_annset = out_annset
self.min_delay_s = min_delay_ms / 1000.0
self.logger = init_logger()
# self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
self.ann_type = ann_type
self.link_pattern = link_pattern
def __call__(self, doc, **kwargs):
if "tweet" in kwargs:
tweet = kwargs["tweet"]
else:
tweet = self.tweet
        delay = time.time() - self._last_call_time
        if delay < self.min_delay_s:
            time.sleep(self.min_delay_s - delay)
        self._last_call_time = time.time()  # record the request time so the next call can honor min_delay_ms
text = doc.text
hdrs = {
"Content-Type": "text/plain; charset=UTF-8",
"Accept": "application/gate+json",
}
params = {
"text": text,
"gcube-token": self.auth_token,
"lang": self.lang,
}
if self.include_all_spots:
params["include_all_spots"] = "true"
if tweet:
params["tweet"] = "true"
if self.long_text is not None:
params["long_text"] = self.long_text
if self.epsilon is not None:
params["epsilon"] = self.epsilon
response = requests.post(self.url, params=params, headers=hdrs)
scode = response.status_code
if scode != 200:
raise Exception(f"Something went wrong, received status code {scode}")
        response_json = response.json()
        # self.logger.debug(f"Response JSON: {response_json}")
        ents = response_json.get("annotations", {})
annset = doc.annset(self.out_annset)
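        # the returned offsets are treated as Java-style (UTF-16) offsets; OffsetMapper
        # converts them to Python string offsets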
om = OffsetMapper(text)
for ent in ents:
start = ent["start"]
end = ent["end"]
start, end = om.convert_to_python([start, end])
feats = {}
title = ent.get("title")
if title is not None:
if self.link_pattern:
feats["url"] = self.link_pattern.format(self.lang, title)
else:
feats["title"] = title
for fname in ["id", "rho", "link_probability", "lp"]:
fval = ent.get(fname)
if fval is not None:
feats[fname] = fval
# self.logger.debug(f"Adding annotation {start},{end},{feats}")
annset.add(start, end, self.ann_type, features=feats)
return doc
class TextRazorTextAnnotator(Annotator):
"""
An annotator that sends document text to the TextRazor Annotation service (https://www.textrazor.com/)
and uses the result to annotate the document.
    NOTE: this annotator and how it is parametrized may still change!
"""
def __init__(
self,
lang=None, # if None/not specified, TextRazor auto-detects
auth_token=None,
url=None, # use default
extractors=None,
out_annset="",
min_delay_ms=501,
):
"""
Create a TextRazorTextAnnotator.
Args:
lang: if specified, override the auto-detected language of the text
auth_token: the authentication token needed to use the service
            url: the annotation service endpoint; if None, the default endpoint https://api.textrazor.com is used
            extractors: a list of extractor names or a string with comma-separated extractor names to add to the
                minimum extractors (words, sentences). If None, uses words, sentences, entities.
                NOTE: currently only words, sentences, entities are supported!
out_annset: the annotationset to put the new annotations in
min_delay_ms: minimum time in ms to wait between requests to the server
"""
if url is None:
url = "https://api.textrazor.com"
self.url = url
self.lang = lang
self.out_annset = out_annset
self.auth_token = auth_token
self.min_delay_s = min_delay_ms / 1000.0
self.logger = init_logger()
self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
if extractors is not None:
if isinstance(extractors, str):
extractors = extractors.split(",")
if isinstance(extractors, list):
allextrs = set()
allextrs.update(extractors)
allextrs.update(["words", "sentences"])
self.extractors = ",".join(list(allextrs))
else:
raise Exception("Odd extractors, must be list of strings or string")
else:
self.extractors = "words,sentences,entities"
def __call__(self, doc, **kwargs):
        delay = time.time() - self._last_call_time
        if delay < self.min_delay_s:
            time.sleep(self.min_delay_s - delay)
        self._last_call_time = time.time()  # record the request time so the next call can honor min_delay_ms
text = doc.text
hdrs = {
# 'Content-Type': 'text/plain; charset=UTF-8',
# 'Accept-encoding': 'gzip' # TODO: to enable compressed responses
# 'Content-encoding': 'gzip' # TODO: to enable compressed requests
"X-TextRazor-Key": self.auth_token
}
data = {"text": text.encode("UTF-8")}
if self.extractors:
data["extractors"] = self.extractors
if self.lang:
data["languageOverride"] = self.lang
self.logger.debug(f"Sending request to {self.url}, data={data}, headers={hdrs}")
response = requests.post(
self.url,
# params=params,
data=data,
headers=hdrs,
)
scode = response.status_code
if scode != 200:
raise Exception(f"Something went wrong, received status code {scode}")
        response_json = response.json()
        ok = response_json.get("ok", False)
        if not ok:
            raise Exception(f"Something went wrong, did not get OK, json: {response_json}")
        self.logger.debug(f"Response JSON: {response_json}")
        resp = response_json.get("response", {})
entities = resp.get("entities", [])
sentences = resp.get("sentences", [])
categories = resp.get("categories", [])
topics = resp.get("topics", [])
entailments = resp.get("entailments", [])
relations = resp.get("relations", [])
properties = resp.get("properties", [])
nounphrases = resp.get("nounPhrases", [])
language = resp.get("language")
languageIsReliable = resp.get("languageIsReliable")
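        # NOTE: categories, topics, entailments, relations, properties, nounphrases,
        # language and languageIsReliable are retrieved above but not (yet) used to create annotations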
tok2off = {} # maps token idxs to tuples (start,end)
annset = doc.annset(self.out_annset)
for s in sentences:
sentstart = None
sentend = None
words = s.get("words", [])
end = None
for word in words:
start = word["startingPos"]
end = word["endingPos"]
if sentstart is None:
sentstart = start
tokidx = word["position"]
feats = {}
feats["partOfSpeech"] = word["partOfSpeech"]
feats["lemma"] = word["lemma"]
if word.get("stem"):
feats["stem"] = word["stem"]
annset.add(start, end, "Token", features=feats)
tok2off[tokidx] = (start, end)
if end is not None:
sentend = end
if sentstart is not None and sentend is not None:
annset.add(sentstart, sentend, "Sentence")
for ent in entities:
feats = {}
for fname in [
"wikiLink",
"entityEnglishId",
"wikidataId",
"relevanceScore",
"confidenceScore",
"type",
"freebaseId",
"entityId",
"freebaseTypes",
]:
if fname in ent:
feats[fname] = ent[fname]
annset.add(ent["startingPos"], ent["endingPos"], "Entity", feats)
return doc
class ElgTextAnnotator(Annotator):
"""
An annotator that sends text to one of the services registered with the European Language Grid
(https://live.european-language-grid.eu/) and uses the result to create annotations.
    NOTE: this annotator may not be fully implemented and is not properly tested yet!
"""
def __init__(
self,
auth_token=None,
url=None,
out_annset="",
min_delay_ms=501,
anntypes_map=None,
):
"""
Create an ElgTextAnnotator.
Args:
auth_token: the authentication token from ELG
url: the annotation service URL to use
out_annset: the name of the annotation set where to create the annotations (default: "")
min_delay_ms: the minimum delay time between requests in milliseconds (default: 501 ms)
anntypes_map: a map for renaming the annotation type names from the service to the ones to use in
the annotated document.
"""
assert url is not None
self.auth_token = auth_token
self.url = url
self.min_delay_s = min_delay_ms / 1000.0
self.anntypes_map = anntypes_map
self.out_annset = out_annset
self.logger = init_logger(__name__)
# self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
def __call__(self, doc, **kwargs):
        delay = time.time() - self._last_call_time
        if delay < self.min_delay_s:
            time.sleep(self.min_delay_s - delay)
        self._last_call_time = time.time()  # record the request time so the next call can honor min_delay_ms
om = OffsetMapper(doc.text)
request_json = json.dumps(
{"type": "text", "content": doc.text, "mimeType": "text/plain"}
)
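        # build an ELG text request: a JSON object with "type": "text", the document content
        # and its mime type, as expected by ELG text-processing services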
hdrs = {"Content-Type": "application/json"}
if self.auth_token:
hdrs["Authorization"] = f"Bearer {self.auth_token}"
response = requests.post(self.url, data=request_json, headers=hdrs)
scode = response.status_code
if scode != 200:
raise Exception(
f"Something went wrong, received status code/text {scode} / {response.text}"
)
response_json = response.json()
# self.logger.debug(f"Response JSON: {json}")
# TODO: check that we have got
# - a map
# - which has the "response" key
        # - response value is a map which has "type" == "annotations" and
        # - "annotations" is a map with keys being the annotation types and values arrays of annotations
ents = response_json.get("response", {}).get("annotations", {})
annset = doc.annset(self.out_annset)
for ret_anntype, ret_anns in ents.items():
if self.anntypes_map:
anntype = self.anntypes_map.get(ret_anntype, ret_anntype)
else:
anntype = ret_anntype
for ret_ann in ret_anns:
start = ret_ann["start"]
end = ret_ann["end"]
feats = ret_ann.get("features", {})
start, end = om.convert_to_python([start, end])
annset.add(start, end, anntype, features=feats)
return doc
Classes
class ElgTextAnnotator (auth_token=None, url=None, out_annset='', min_delay_ms=501, anntypes_map=None)
-
An annotator that sends text to one of the services registered with the European Language Grid (https://live.european-language-grid.eu/) and uses the result to create annotations.
NOTE: this annotator may not be fully implemented and is not properly tested yet!
Create an ElgTextAnnotator.
Args
auth_token
- the authentication token from ELG
url
- the annotation service URL to use
out_annset
- the name of the annotation set where to create the annotations (default: "")
min_delay_ms
- the minimum delay time between requests in milliseconds (default: 501 ms)
anntypes_map
- a map for renaming the annotation type names from the service to the ones to use in the annotated document.
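A minimal usage sketch (the service URL and access token below are placeholders; real values come from the ELG catalogue entry of the service to use):

from gatenlp import Document
from gatenlp.processing.client import ElgTextAnnotator

annotator = ElgTextAnnotator(
    auth_token="ELG_ACCESS_TOKEN",  # placeholder token
    url="https://example.org/elg/process",  # placeholder service URL
    out_annset="elg",
)
doc = annotator(Document("Some text to annotate."))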
Ancestors
- Annotator
- abc.ABC
class GateCloudAnnotator (api_key=None, api_password=None, url=None, ann_types=None, map_types=None, out_annset='', min_delay_ms=501)
-
This annotator sends the text of a document to a GATE Cloud (https://cloud.gate.ac.uk/) endpoint and uses the returned result to create annotations.
Create a GateCloudAnnotator.
Args
api_key
- API key needed to authenticate. Some services can be used in a limited way without authentication.
api_password
- API password needed to authenticate.
url
- the URL of the annotation service endpoint, shown on the GATE Cloud page for the service
ann_types
- this can be used to let the service annotate fewer or more than the default list of annotation types. The default list and all possible annotations are shown on the GATE Cloud page for the service. Either a string with comma-separated annotation types preceded by a colon (e.g. ":Person,:Location") or a python list with those type names (e.g. [":Person", ":Location"]). If the list contains type names without a leading colon, the colon is added.
map_types
- a dict which maps the annotation types from the service to arbitrary new annotation types, any type name not in the map will remain unchanged.
out_annset
- the annotation set in which to store the annotations
min_delay_ms
- minimum time in milliseconds between two subsequent requests to the server
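A minimal usage sketch (key, password and endpoint below are placeholders; the real endpoint URL is shown on the GATE Cloud page of the service to use):

from gatenlp import Document
from gatenlp.processing.client import GateCloudAnnotator

annotator = GateCloudAnnotator(
    api_key="API_KEY",            # placeholder
    api_password="API_PASSWORD",  # placeholder
    url="https://cloud-api.gate.ac.uk/process-document/SERVICE",  # placeholder endpoint
    ann_types=":Person,:Location",
    out_annset="gatecloud",
)
doc = annotator(Document("Barack Obama visited Paris."))
anns = doc.annset("gatecloud")  # the annotations created by the service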
Ancestors
- Annotator
- abc.ABC
class TagMeAnnotator (lang='en', ann_type='Mention', auth_token=None, url=None, task='tag', out_annset='', min_delay_ms=501, tweet=False, include_all_spots=False, long_text=None, epsilon=None, link_pattern='https://{0}.wikipedia.org/wiki/{1}')
-
An annotator that sends text to the TagMe Annotation service (https://sobigdata.d4science.org/group/tagme/tagme) and uses the result to annotate the document.
Create a TagMeAnnotator.
Args
lang
- the language of the text, one of 'de', 'en' (default), 'it'
ann_type
- the annotation type for the new annotations, default is "Mention"
auth_token
- the authentication token needed to use the service
url
- the annotation service endpoint; if None, the default endpoint for the task (spot or tag) is used
task
- one of "spot" (only find mentions) or "tag" (find mentions and link), default is "tag"
out_annset
- the annotationset to put the new annotations in
min_delay_ms
- minimum time in ms to wait between requests to the server
tweet
- if True, TagMe expects a Tweet (default is False)
include_all_spots
- if True, include spots that cannot be linked (default is False)
long_text
- if not None, the context length to use (default: None)
epsilon
- if not None, the epsilon value (float) to use (default: None)
link_pattern
- the URL pattern to use to turn the "title" returned from TagMe into an actual link. The default is "https://{0}.wikipedia.org/wiki/{1}" where {0} gets replaced with the language code and {1} gets replaced with the title.
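A minimal usage sketch (the gcube token below is a placeholder; a real token is obtained by registering with the TagMe service on D4Science):

from gatenlp import Document
from gatenlp.processing.client import TagMeAnnotator

annotator = TagMeAnnotator(
    auth_token="GCUBE_TOKEN",  # placeholder
    lang="en",
    task="tag",
    out_annset="tagme",
)
doc = annotator(Document("Berlin is the capital of Germany."))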
Ancestors
- Annotator
- abc.ABC
class TextRazorTextAnnotator (lang=None, auth_token=None, url=None, extractors=None, out_annset='', min_delay_ms=501)
-
An annotator that sends document text to the TextRazor Annotation service (https://www.textrazor.com/) and uses the result to annotate the document.
NOTE: this annotator and how it is parametrized may still change!
Create a TextRazorTextAnnotator.
Args
lang
- if specified, override the auto-detected language of the text
auth_token
- the authentication token needed to use the service
url
- the annotation service endpoint; if None, the default endpoint https://api.textrazor.com is used
extractors
- a list of extractor names or a string with comma-separated extractor names to add to the minimum extractors (words, sentences). If None, uses words, sentences, entities. NOTE: currently only words, sentences, entities are supported!
out_annset
- the annotationset to put the new annotations in
min_delay_ms
- minimum time in ms to wait between requests to the server
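A minimal usage sketch (the API key below is a placeholder; a real key comes from a TextRazor account):

from gatenlp import Document
from gatenlp.processing.client import TextRazorTextAnnotator

annotator = TextRazorTextAnnotator(
    auth_token="TEXTRAZOR_API_KEY",  # placeholder
    extractors="entities",  # "words" and "sentences" are always added
    out_annset="textrazor",
)
doc = annotator(Document("Paris is the capital of France."))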
Ancestors
- Annotator
- abc.ABC