Source code for gatenlp.gateslave

#!/usr/bin/env python
"""
Module for interacting with a Java GATE process, running API commands on it and
exchanging data with it.
"""

import sys
import subprocess
import os
import platform as sysplatform
import logging
import atexit
# NOTE: we delay importing py4j until the GateSlave initializer runs. This allows us to make GateSlave available
# via gatenlp without forcing everyone to have py4j installed if they do not use the GateSlave.
# from py4j.java_gateway import JavaGateway, GatewayParameters
from gatenlp import Document

# Version of the bundled gatetools-gatenlpslave jar. It is interpolated into the jar
# file name that GateSlave looks up in the "_jars" directory next to this module.
JARVERSION = "1.0"

# Set up logging at import time: basicConfig() installs a default handler on the root
# logger if none is configured yet, then this module's own logger is set to INFO.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def classpath_sep(platform=None):
    """
    Return the Java classpath separator character for an operating system / platform.

    :param platform: name of the platform to return the separator for, compared
        case-insensitively against "windows". If empty or None, the name reported by
        the standard library ``platform.system()`` is used instead.
    :return: ";" for Windows, ":" for every other platform name
    :raises Exception: if no platform was given and the operating system could not be determined
    """
    # Fall back to the detected OS name only when the caller did not supply one.
    osname = platform or sysplatform.system()
    if not osname:
        raise Exception("Could not determine operating system, please use platform parameter")
    return ";" if osname.lower() == "windows" else ":"
def gate_classpath(gatehome, platform=None):
    """
    Return the GATE classpath components as a string, with the element separator
    character appropriate for the operating system.

    If ``gatehome`` contains a file ``gate.classpath``, its (whitespace-stripped) content
    is used as the classpath and the ``bin`` directory is appended. Otherwise the ``lib``
    directory followed by the ``bin`` directory is returned.

    :param gatehome: where GATE is installed, either as a cloned git repo or a downloaded
        installation dir.
    :param platform: platform name passed on to classpath_sep() to select the separator;
        if None the current operating system is used.
    :return: GATE classpath
    :raises Exception: if gatehome does not exist or is not a directory, or if there is
        neither a gate.classpath file nor a lib directory below it
    """
    if not os.path.exists(gatehome):
        raise Exception("GATE home directory does not exist: {}".format(gatehome))
    if not os.path.isdir(gatehome):
        raise Exception("GATE home directory is not a directory: {}".format(gatehome))
    cpsep = classpath_sep(platform)
    bindir = os.path.join(gatehome, "bin")
    cpfile = os.path.join(gatehome, "gate.classpath")
    if os.path.exists(cpfile):
        with open(cpfile, "rt", encoding="utf-8") as fp:
            # Strip surrounding whitespace: a trailing newline in the file would otherwise
            # become part of the last classpath entry and make it unusable.
            cp = fp.read().strip()
        return cp + cpsep + bindir
    libdir = os.path.join(gatehome, "lib")
    if not os.path.isdir(libdir):
        raise Exception("Could not determine class path from {}, no lib directory".format(gatehome))
    # NOTE(review): only the lib directory itself is put on the classpath, not the jar
    # files inside it - confirm that this is what the Java side expects.
    return libdir + cpsep + bindir
class GateSlave:
    """
    Connection to a Java GATE process through py4j, optionally starting that process.
    """

    def __init__(self, port=25333, start=True, java="java", host="127.0.0.1", gatehome=None, platform=None):
        """
        Create an instance of the GateSlave and either start our own Java GATE process for it
        to use (start=True) or connect to an existing one (start=False).

        After the GateSlave instance has been created successfully, it is possible to:

        * Use one of the methods of the instance to perform operations on the Java side or exchange data
        * use GateSlave.slave to invoke methods from the PythonSlave class on the Java side
        * use GateSlave.jvm to directly construct objects or call instance or static methods

        NOTE: the GATE process must not output anything important/big to stderr because everything
        from stderr gets captured and used for communication between the Java and Python processes.
        At least part of the output to stderr may only be passed on after the GATE process has ended.

        Example:

        ::

            gs = GateSlave()
            pipeline = gs.slave.loadPipelineFromFile("thePipeline.xgapp")
            doc = gs.slave.createDocument("Some document text")
            gs.slave.run4doc(pipeline,doc)
            pdoc = gs.gdoc2pdoc(doc)
            gs.slave.deleteResource(doc)
            # process the gatenlp Document pdoc ...

        :param port: port to use
        :param start: if True, start a Java GATE process, otherwise connect to an existing one
        :param java: path to the java binary to run or the java command to use from the PATH (for start=True)
        :param host: host the Java GATE process is running on; passed to the started process
            (start=True) and used as the address the gateway connects to
        :param gatehome: where GATE is installed (only relevant if start=True). If None, expects
            environment variable GATE_HOME to be set.
        :param platform: system platform we run on, one of Windows, Linux (also for MacOs) or Java
        :raises Exception: if gatehome cannot be determined, the slave jar is missing, or the
            Java process reports failure or ends before reporting that the server is ready
        """
        # py4j is imported here, not at module level, so the module can be imported without it.
        from py4j.java_gateway import JavaGateway, GatewayParameters

        self.port = port
        self.host = host
        self.start = start
        self.gatehome = gatehome
        self.platform = platform
        self.gateprocess = None
        self.gateway = None
        self.slave = None
        self.jvm = None
        self.closed = False
        if gatehome is None and start:
            gatehome = os.environ.get("GATE_HOME")
            if gatehome is None:
                raise Exception("Parameter gatehome is None and environment var GATE_HOME not set")
            self.gatehome = gatehome
        if start:
            # make sure we find the jar we need
            jarloc = os.path.join(os.path.dirname(__file__), "_jars", f"gatetools-gatenlpslave-{JARVERSION}.jar")
            if not os.path.exists(jarloc):
                raise Exception("Could not find jar, {} does not exist".format(jarloc))
            cpsep = classpath_sep(platform=platform)
            cmdandparms = [
                java,
                "-cp",
                jarloc + cpsep + gate_classpath(self.gatehome, platform=platform),
                "gate.tools.gatenlpslave.GatenlpSlave",
                str(port),
                host,
            ]
            subproc = subprocess.Popen(cmdandparms, stderr=subprocess.PIPE, bufsize=0, encoding="utf-8")
            self.gateprocess = subproc
            # The Java side reports on stderr whether the server could be started; everything
            # else it writes there before that is passed on to our own stderr.
            while True:
                rawline = subproc.stderr.readline()
                if rawline == "":
                    # End of stream: the process closed stderr without reporting a status.
                    # Without this check the loop would spin forever on empty reads.
                    raise Exception(
                        "Java GATE process ended before the server was ready, exit code: {}".format(subproc.poll())
                    )
                line = rawline.strip()
                if line == "PYTHON SLAVE SERVER OK":
                    break
                if line == "PYTHON SLAVE SERVER NOT OK":
                    raise Exception("Could not start server, giving up")
                print(line, file=sys.stderr)
            atexit.register(self.close)
        # Connect to the given host; with the default host this is the py4j default address.
        self.gateway = JavaGateway(gateway_parameters=GatewayParameters(address=host, port=port))
        self.jvm = self.gateway.jvm
        self.slave = self.gateway.entry_point

    def close(self):
        """
        Clean up: if the gate slave process was started by us, we will shut it down.

        Does nothing if the process was not started by this instance or if close() has
        already been called.

        :return:
        """
        if not self.start or self.closed:
            return
        self.closed = True
        # Either attribute may still be None if initialization did not complete.
        if self.gateway is not None:
            self.gateway.shutdown()
        if self.gateprocess is not None:
            # Pass on whatever is still pending on the process' stderr before waiting for it.
            for line in self.gateprocess.stderr:
                print(line, file=sys.stderr, end="")
            self.gateprocess.wait()

    def load_gdoc(self, path, mimetype=None):
        """
        Let GATE load a document from the given path and return a handle to it.

        :param path: path to the gate document to load.
        :param mimetype: a mimetype to use when loading; None is sent as the empty string.
        :return: a handle to the GATE document
        """
        if mimetype is None:
            mimetype = ""
        return self.slave.loadDocumentFromFile(path, mimetype)

    def save_gdoc(self, gdoc, path, mimetype=None):
        """
        Save GATE document to the given path.

        :param gdoc: GATE document handle
        :param path: destination path
        :param mimetype: mimetype, only the following types are allowed: ""/None: GATE XML,
            application/fastinfoset, and all mimetypes supported by the Format_Bdoc plugin.
        :return:
        """
        if mimetype is None:
            mimetype = ""
        # The document handle has to be passed along, otherwise the Java side does not
        # know which document to save.
        self.slave.saveDocumentToFile(gdoc, path, mimetype)

    def gdoc2pdoc(self, gdoc):
        """
        Convert the GATE document to a python document and return it.

        :param gdoc: the handle to a GATE document
        :return: a gatenlp Document instance
        """
        bjs = self.slave.getBdocJson(gdoc)
        return Document.load_string(bjs)

    def pdoc2gdoc(self, pdoc):
        """
        Convert the Python gatenlp document to a GATE document and return a handle to it.

        :param pdoc: python gatenlp Document
        :return: handle to GATE document
        """
        bdoc_json = pdoc.save_string()
        return self.slave.getDocument4BdocJson(bdoc_json)

    def load_pdoc(self, path, mimetype=None):
        """
        Load a document from the given path, using GATE and convert and return as gatenlp Python document.

        :param path: path to load document from
        :param mimetype: mime type to use
        :return: gatenlp document
        """
        gdoc = self.load_gdoc(path, mimetype)
        return self.gdoc2pdoc(gdoc)

    def del_gdoc(self, gdoc):
        """
        Delete/unload the GATE document from GATE. This is necessary to do for each GATE document
        that is not used anymore, otherwise the documents will accumulate in the Java process
        and eat up all memory.

        NOTE: just removing all references to the GATE document does not delete/unload the document!

        :param gdoc: the document to remove
        :return:
        """
        self.jvm.gate.Factory.deleteResource(gdoc)

    def show_gui(self):
        """
        Show the GUI for the started GATE process.

        NOTE: this is more of a hack and may cause sync problems when closing down the GATE slave.

        :return:
        """
        self.slave.showGui()