#!/usr/bin/env python
"""
Support for interacting between a GATE (java) process and a gatenlp (Python) process.
"""
# TODO: add a __main__ section that can be used to show help about usage?
import sys
import os
import io
import traceback
import gatenlp
from argparse import ArgumentParser
import inspect
import logging
from gatenlp.changelog import ChangeLog
from gatenlp.document import Document
from gatenlp.offsetmapper import OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON
from gatenlp import logger
import json
# We cannot simply do this, because on some systems Python may guess the wrong encoding for stdin:
# instream = sys.stdin
# Instead use utf-8 explicitly:
instream = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
ostream = sys.stdout
sys.stdout = sys.stderr
class _PrWrapper:
def __init__(self):
self.func_execute = None # the function to process each doc
self.func_execute_allowkws = False
self.func_start = None # called when processing starts
self.func_start_allowkws = False
self.func_finish = None # called when processing finishes
self.func_finish_allowkws = False
self.func_reduce = None # function for combining results
self.func_reduce_allowkws = False
self.script_parms = {} # Script parms to pass to each execute
self.logger = None
def execute(self, doc):
if self.func_execute_allowkws and self.script_parms:
ret = self.func_execute(doc, **self.script_parms)
else:
ret = self.func_execute(doc)
if ret is None:
if doc.changelog is None:
ret = doc
else:
ret = doc.changelog
return ret
def start(self, script_params):
if script_params:
self.script_parms = script_params
# TODO: amend the script params with additional data from here?
if self.func_start is not None:
if self.func_start_allowkws and self.script_parms:
self.func_start(**self.script_parms)
else:
self.func_start()
def finish(self):
if self.func_finish is not None:
if self.func_finish_allowkws and self.script_parms:
return self.func_finish(**self.script_parms)
else:
return self.func_finish()
def reduce(self, resultslist):
if self.func_reduce is not None:
if self.func_reduce_allowkws and self.script_parms:
ret = self.func_reduce(resultslist, **self.script_parms)
else:
ret = self.func_reduce(resultslist)
return ret
def _check_exec(func):
"""
Check the signature of the func to see if it is a proper
execute function: must accept one (or more optional) args
and can accept kwargs. This returns true of kwargs are accepted
:param func: the function to check
:return: true if the function accepts kwargs
"""
argspec = inspect.getfullargspec(func)
if len(argspec.args) == 1 \
or len(argspec.args) == 2 and argspec.args[0] == "self" \
or argspec.varargs is not None:
pass
else:
raise Exception("Processing resource execution function does not accept exactly one or any number of arguments")
if argspec.varkw is not None:
return True
else:
return False
def _has_method(theobj, name):
"""
Check if the object has a callable method with the given name,
if yes return the method, otherwise return None
:param theobj: the object that contains the method
:param name: the name of the method
:return: the method or None
"""
tmp = getattr(theobj, name, None)
if tmp is not None and callable(tmp):
return tmp
else:
return None
def _pr_decorator(what):
"""
This is the decorator to identify a class or function as a processing
resource. This is made available with the name PR in the gatenlp
package.
This creates an instance of PRWrapper and registers all the relevant
functions of the decorated class or the decorated function in the
wrapper.
:param what: the class or function to decorate.
:return: modified class or function
"""
gatenlp.gate_python_plugin_pr = "The PR from here!!!"
wrapper = _PrWrapper()
if inspect.isclass(what):
what = what() # create an instance
# TODO: instead of this we could just as well store the instance and
# directly call the instance methods from the wrapper!
execmethod = _has_method(what, "__call__")
if not execmethod:
raise Exception("PR does not have a __call__(doc) method.")
allowkws = _check_exec(execmethod)
wrapper.func_execute_allowkws = allowkws
wrapper.func_execute = execmethod
startmethod = _has_method(what, "start")
if startmethod:
wrapper.func_start = startmethod
if inspect.getfullargspec(startmethod).varkw:
wrapper.func_start_allowkws = True
finishmethod = _has_method(what, "finish")
if finishmethod:
wrapper.func_finish = finishmethod
if inspect.getfullargspec(finishmethod).varkw:
wrapper.func_finish_allowkws = True
reducemethod = _has_method(what, "reduce")
if reducemethod:
wrapper.func_reduce = reducemethod
if inspect.getfullargspec(reducemethod).varkw:
wrapper.func_reduce_allowkws = True
elif inspect.isfunction(what):
allowkws = _check_exec(what)
wrapper.func_execute = what
wrapper.func_execute_allowkws = allowkws
else:
raise Exception(f"Decorator applied to something that is not a function or class: {what}")
gatenlp.gate_python_plugin_pr = wrapper
return wrapper
[docs]class DefaultPr:
def __call__(self, doc, **kwargs):
logger.debug("DefaultPr: called __call__() with doc={}, kwargs={}".format(doc, kwargs))
return doc
[docs] def start(self, **kwargs):
logger.debug("DefaultPr: called start() with kwargs={}".format(kwargs))
logger.warning("Running DefaultPr: did you define a @GateNlpPr class or function?")
return None
[docs] def finish(self, **kwargs):
logger.debug("DefaultPr: called finish() with kwargs={}".format(kwargs))
logger.warning("Finished DefaultPr: did you define a @GateNlpPr class or function?")
return None
[docs] def reduce(self, resultlist, **kwargs):
logger.debug("DefaultPr: called reduce() with results {} and kwargs={}".format(
resultlist, kwargs))
return None
[docs]def get_arguments(from_main=False):
argparser = ArgumentParser()
argparser.add_argument("--mode", default="check",
help="Interaction mode: pipe|http|websockets|file|dir|check (default: check)")
argparser.add_argument("--format", default="json",
help="Exchange format: json|json.gz|cjson")
argparser.add_argument("--path", help="File/directory path for modes file/dir")
argparser.add_argument("--out", help="Output file/directory path for modes file/dir")
argparser.add_argument("-d", action="store_true",
help="Enable debugging: log to stderr")
argparser.add_argument("--log_lvl", type=str, help="Log level to use: DEBUG|INFO|WARNING|ERROR|CRITICAL")
if(from_main):
argparser.add_argument("pythonfile")
args = argparser.parse_args()
return args
[docs]def interact(args=None):
"""
Starts and handles the interaction with a GATE python plugin process.
This will get started by the GATE plugin if the interaction uses
pipes, but can also be started separately for http/websockets.
This MUST be called in the user's python file!
The python file should also have one class or function decorated
with the @gatenlp.PR decorator to identify it as the
processing resource to the system.
:return:
"""
loglvls = {
"DEBUG": logging.DEBUG,
"INFO": logging.INFO,
"WARNING": logging.WARNING,
"ERROR": logging.ERROR,
"CRITICAL": logging.CRITICAL
}
# before we do anything we need to check if a PR has actually
# been defined. If not, use our own default debugging PR
if gatenlp.gate_python_plugin_pr is None:
logger.warning("No processing resource defined with @GateNlpPr decorator, using default do-nothing")
_pr_decorator(DefaultPr)
pr = gatenlp.gate_python_plugin_pr
if args is None:
args = get_arguments()
if args.d:
logger.setLevel(logging.DEBUG)
if args.log_lvl:
if args.log_lvl not in loglvls:
raise Exception("Not a valid log level: {}".format(args.log_lvl))
logger.setLevel(loglvls[args.log_lvl])
if args.mode == "check":
return
logger.info("Using gatenlp version {}".format(gatenlp.__version__))
logger.debug("Starting interaction args={}".format(args))
if args.mode == "pipe":
if args.format != "json":
raise Exception("For interaction mode pipe, only format=json is supported")
for line in instream:
try:
request = json.loads(line)
except Exception as ex:
logger.error("Unable to load from JSON:\n{}".format(line))
raise ex
logger.debug("Got request object: {}".format(request))
cmd = request.get("command", None)
stop_requested = False
ret = None
try:
if cmd == "execute":
doc = Document.from_dict(request.get("data"))
om = doc.to_type(OFFSET_TYPE_PYTHON)
doc.set_changelog(ChangeLog())
pr.execute(doc)
# NOTE: for now we just discard what the method returns and always return
# the changelog instead!
chlog = doc.changelog
# if we got an offset mapper earlier, we had to convert, so we convert back to JAVA
if om:
# replace True is faster, and we do not need the ChangeLog any more!
chlog.fixup_changes(offset_mapper=om, offset_type=OFFSET_TYPE_JAVA, replace=True)
ret = doc.changelog.to_dict()
logger.debug("Returning CHANGELOG: {}".format(ret))
elif cmd == "start":
parms = request.get("data")
pr.start(parms)
elif cmd == "finish":
ret = pr.finish()
elif cmd == "reduce":
results = request.get("data")
ret = pr.reduce(results)
elif cmd == "stop":
stop_requested = True
else:
raise Exception("Odd command received: {}".format(cmd))
response = {
"data": ret,
"status": "ok",
}
except Exception as ex:
error = repr(ex)
tb_str = traceback.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__)
print("ERROR when running python code:", file=sys.stderr)
for line in tb_str:
print(line, file=sys.stderr, end="") # what we get from traceback already has new lines
info = "".join(tb_str)
# in case we want the actual stacktrace data as well:
st = [(f.filename, f.lineno, f.name, f.line) for f in traceback.extract_tb(ex.__traceback__)]
response = {
"data": None,
"status": "error",
"error": error,
"info": info,
"stacktrace": st
}
logger.debug("Sending back response: {}".format(response))
print(json.dumps(response), file=ostream)
ostream.flush()
if stop_requested:
break
# TODO: do any cleanup/restoring needed
logger.debug("Finishing interaction")
elif args.mode == "http":
raise Exception("Mode http not implemented yet")
elif args.mode == "websockets":
raise Exception("Mode websockets not implemented yet")
elif args.mode in ["file", "dir"]:
if not args.path:
raise Exception("Mode file or dir but no --path specified")
fileext = ".bdoc" + args.format
if args.mode == "file" and not os.path.isfile(args.path):
raise Exception("Mode file but path is not a file: {}".format(args.path))
elif args.mode == "dir" and not os.path.isdir(args.path):
raise Exception("Mode dir but path is not a directory: {}".format(args.path))
if args.mode == "file":
pr.start({})
logger.info(f"Loading file {args.path}")
doc = Document.load(args.path)
pr.execute(doc)
pr.finish()
if args.out:
logger.info(f"Saving file to {args.out}")
doc.save(args.out)
else:
logger.info(f"Saving file to {args.path}")
doc.save(args.path)
else:
import glob
pr.start({})
files = glob.glob(args.path+os.path.sep+"*"+fileext)
for file in files:
logger.info("Loading file {}".format(file))
doc = Document.load(file)
pr.execute(doc)
if args.out:
tofile = os.path.join(args.out, os.path.basename(file))
logger.info("Saving to {}".format(tofile))
doc.save(tofile)
else:
logger.info("Saving to {}".format(file))
doc.save(file)
pr.finish()
else:
raise Exception("Not a valid mode: {}".format(args.mode))
if __name__ == "__main__":
# we run this from the command line so we need to also first load the PR code from the python file
args = get_arguments(from_main=True)
import importlib.util
spec = importlib.util.spec_from_file_location("gateapp", args.pythonfile)
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
interact(args=args)