This tutorial covers:
Aims:
conda create -n gatenlp python=3.9
conda activate gatenlp
To install the most recent release together with all dependencies:
pip install gatenlp[all]
Also install support for Jupyter notebooks and for showing the slides:
pip install jupyter notebook ipython ipykernel RISE
Create kernel for the conda environment:
python -m ipykernel install --user --name gatenlp --display-name "Python-gatenlp"
GATE_HOME
)jupyter notebook
, choose New -> Python-gatenlpjupyter notebook module11-python.ipynb
)ipython
and interactively type and run commands)If kernel error in Jupyter, try something like (Anaconda bug, apparently):
python C:\Users\USERNAME\miniconda3\envs\gatenlp\Scripts\pywin32_postinstall.py -install
from gatenlp import Document
Create a document from some text/string:
doc1 = Document("This is a small test document")
Print the document:
print(doc1)
Document(This is a small test document,features=Features({}),anns=[])
In a notebook, documents are visualized using the html-viewer when a document is the last value of a cell or when display(doc1)
is used:
# from IPython.display import display
doc1
display(doc1)
Document.load(some_location, ...)
fmt
parameterdoc2 = Document.load('./data/document-testing.txt')
doc2
thedocument.save(location, ...)
fmt
parameterdoc1.save("myfirstdocument.bdocjs")
# Read the saved JSON serialization back and print it verbatim.
with open("myfirstdocument.bdocjs", "rt", encoding="utf-8") as infp:
    print(infp.read())
{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}
doc1.save("myfirstdocument.bdocym") # use YAML serialization
# Read the saved YAML serialization back and print it verbatim.
with open("myfirstdocument.bdocym", "rt", encoding="utf-8") as infp:
    print(infp.read())
annotation_sets: {} features: {} name: '' offset_type: p text: This is a small test document
# Can also "save" to memory/string, here the format is needed!
doc1.save_mem(fmt="bdocjs")
'{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}'
import datetime
doc1.features["loading_date"] = str(datetime.datetime.now())
doc1.features["purpose"] = "Testing gatenlp."
doc1.features["numeric_value"] = 22
doc1.features["dict_of_objects"] = {"dict_key": "dict_value", "a_list": [1,2,3,4,5]}
doc1.features["_tmp1"] = "some value"
doc1.features["__tmp2"] = 12345
doc1
print("1:", doc1.features["purpose"])
print("2:", doc1.features.get("doesntexist"))
print("3:", doc1.features.get("doesntexist", "NA!"))
1: Testing gatenlp. 2: None 3: NA!
# Print every document feature as "name: value", one per line.
for name, value in doc1.features.items():
    print(f"{name}: {value}")
loading_date: 2021-02-06 19:46:56.831158 purpose: Testing gatenlp. numeric_value: 22 dict_of_objects: {'dict_key': 'dict_value', 'a_list': [1, 2, 3, 4, 5]} _tmp1: some value __tmp2: 12345
Let's check how the document with features is serialized to "bdocjs" (JSON) format:
import pprint, json
js_str = doc1.save_mem(fmt="bdocjs")
js = json.loads(js_str)
pprint.pprint(js)
{'annotation_sets': {}, 'features': {'_tmp1': 'some value', 'dict_of_objects': {'a_list': [1, 2, 3, 4, 5], 'dict_key': 'dict_value'}, 'loading_date': '2021-02-06 19:46:56.831158', 'numeric_value': 22, 'purpose': 'Testing gatenlp.'}, 'name': '', 'offset_type': 'p', 'text': 'This is a small test document'}
add
method of the set# create and get an annotation set with the name "Set1"
annset = doc1.annset("Set1")
#Now, add an annotation, this method returns the newly created annotation
annset.add(0,4,"AnnType1")
Annotation(0,4,AnnType1,features=Features({}),id=0)
annset.add(0, 4, "Token", {"id": "token1'"})
annset.add(5, 7, "Token", {"id": "token2'"})
annset.add(8, 9, "Token", {"id": "token3'"})
annset.add(10, 15, "Token", {"id": "token4'"})
annset.add(16, 20, "Token", {"id": "token5"})
annset.add(21, 29, "Token", {"id": "token6"})
annset.add(0, 29, "Sentence", {"what": "The first 'sentence' annotation"});
# Iterate over the annotation set and print each annotation.
for ann in annset:
    print(ann)
Annotation(0,4,AnnType1,features=Features({}),id=0) Annotation(0,4,Token,features=Features({'id': "token1'"}),id=1) Annotation(0,29,Sentence,features=Features({'what': "The first 'sentence' annotation"}),id=7) Annotation(5,7,Token,features=Features({'id': "token2'"}),id=2) Annotation(8,9,Token,features=Features({'id': "token3'"}),id=3) Annotation(10,15,Token,features=Features({'id': "token4'"}),id=4) Annotation(16,20,Token,features=Features({'id': 'token5'}),id=5) Annotation(21,29,Token,features=Features({'id': 'token6'}),id=6)
doc1
doc3 = Document.load("data/ann-relations.bdocjs")
doc3.show(htmlid="view1")
# make a variable for each annotation type: the name is the lower-cased
# type name (ann1, ann2, ... as used in the statements that follow) and the
# value is the annotation at index 0 among the annotations of that type
for anntype in list(doc3.annset("set1").type_names):
    vars()[anntype.lower()] = doc3.annset("set1").with_type(anntype).for_idx(0)
print("Ann2 isoverlapping Ann1:", ann2.isoverlapping(ann1))
print("Ann2 isbefore Ann3:", ann2.isbefore(ann3))
print("Ann3 isafter Ann2:", ann3.isafter(ann2))
print("Ann1 iscovering Ann5:", ann1.iscovering(ann5))
print("Ann3 iscoextensive Ann9:", ann3.iscoextensive(ann9))
print("Ann6 iswithin Ann1:", ann6.iswithin(ann1))
print("Ann4 isrightoverlapping Ann1:", ann4.isrightoverlapping(ann1))
Ann2 isoverlapping Ann1: True Ann2 isbefore Ann3: True Ann3 isafter Ann2: True Ann1 iscovering Ann5: True Ann3 iscoextensive Ann9: True Ann6 iswithin Ann1: True Ann4 isrightoverlapping Ann1: True
from gatenlp import Span
span1 = Span(3,4)
span2 = ann2.span
span3 = doc3.annset("set1").span
span4 = Span(ann5)
print([f"span{i}: {s}" for i, s in enumerate([span1, span2, span3, span4])])
['span0: Span(3,4)', 'span1: Span(0,6)', 'span2: Span(0,45)', 'span3: Span(12,18)']
set1 = doc3.annset("set1")
print("Within Ann1: ", [a.type for a in set1.within(ann1)])
print("Coextensive with Ann3:", [a.type for a in set1.coextensive(ann3)])
print("Coextensive with span of Ann3:", [a.type for a in set1.coextensive(ann3.span)])
Within Ann1: ['Ann10', 'Ann5', 'Ann3', 'Ann7', 'Ann9', 'Ann11', 'Ann6', 'Ann8', 'Ann12'] Coextensive with Ann3: ['Ann9'] Coextensive with span of Ann3: ['Ann3', 'Ann9']
print("Size of set1:", len(set1))
subset1 = set1.within(ann1)
print("Size of subset1:", len(subset1))
Size of set1: 12 Size of subset1: 9
# try to add an annotation to subset1; the exception is caught and printed
# so that the message can be shown:
try:
    subset1.add(2, 3, "ANewOne")
except Exception as ex:
    print("Got exception:", ex)
Got exception: Cannot add an annotation to an immutable annotation set
# make the set mutable and try again
subset1.immutable = False
subset1.add(2,3,"ANewOne")
print("Size of set1:", len(set1))
print("Size of subset1:", len(subset1))
print("Is set1 detached:", set1.isdetached())
print("Is subset1 detached:", subset1.isdetached())
Size of set1: 12 Size of subset1: 10 Is set1 detached: False Is subset1 detached: True
subset1
, NOT the original setsubset1.clone_anns()
Supported formats:
# let's load and view the main GateNLP documentation page:
doc4 = Document.load("https://gatenlp.github.io/python-gatenlp/", fmt="html")
doc4
Use: doc.show(annsets=["set1", ("set2", "type1"), ("set3", ["type1", "type2"])])
doc4.show(annsets=[("Original markups", ["h1","h2","a","li"])])
doc4.save("gatenlp-doc.html", fmt="html-ann-viewer", notebook=False, stretch_height=True)
from IPython.display import IFrame
IFrame("gatenlp-doc.html", 900,400)
offset_type
is either p
or j
doc = corpus[2]
/ corpus[3] = doc
store(doc)
to save a document to the index stored in the document featureappend(doc)
to add a new document to the corpusNone
None
indicates absence of documentNone
indicates that document should get removed or should not get updatedstore
methodfrom gatenlp.corpora import ListCorpus
texts = ["this is text one", "here is text two", "and this is text three"]
docs = [Document(t) for t in texts]
lcorp = ListCorpus(docs)
doc1 = lcorp[1]
print(doc1.features)
lcorp.store(doc1)
Features({'__idx': 1, '__idx_140568736615952': 1})
from gatenlp.corpora import DirFilesCorpus
corp1 = DirFilesCorpus("data/dir1") # get all the matching filenames from the directory
print("Number of documents:", len(corp1))
doc1 = corp1[2] # actually read the document from the directory
print("Text for idx=2:", doc1.text)
print("Features for idx=2:", doc1.features)
doc1.annset().add(0,len(doc1.text), "Document", dict(what="test document"))
# this writes the document back to the file:
corp1.store(doc1)
# could also have used: corp1[2] = doc1
Number of documents: 4 Text for idx=2: This is another document for testing which mentions John Smith. Features for idx=2: Features({'gate.SourceURL': 'created from String', '__idx_140568772663336': 2, '__idx': 2, '__relpath': 'doc2.bdocjs', '__abspath': 'data/dir1/doc2.bdocjs'})
NumberedDirFilesCorpus
: create a directory tree where the path represents digits of a large number000/002/341.bdoc
for element number 2341 of 600000000 totalEveryNthCorpus
: wrap a corpus and access only elements $k*i + o$ for $i = 0..\lfloor(n/k)\rfloor$ShuffledCorpus
: random re-ordering of the elements in the wrapped corpusCachedCorpus
: store retrieved elements from a (slow) base corpus in a (fast) cache corpusappend(doc)
to add Document instancesclose()
to end writingwith documentdestination as dest:
patternJsonLinesFileSource/Destination
: one line of bdocjs serialization per documentTsvFileSource
: one column in a TSV file contains the text, other columns can be stored in featuresPandasDfSource
: similar to TSV source, but for a Pandas data frameAnnotator
from gatenlp.corpora import ListCorpus
from gatenlp.processing.pipeline import Pipeline
from gatenlp.processing.annotator import AnnotatorFunction
from gatenlp.processing.executor import SerialCorpusExecutor
texts = ["Some text.", "Another text.", "Also some text here.", "And this is also some text."]
docs = [Document(t) for t in texts]
corp = ListCorpus(docs)
def annfunc1(doc):
    """Add an "Ann1" annotation over offsets 0-3 and return the document.

    The annotation goes into the set returned by ``doc.annset()`` called
    without a name. The same document object is returned.
    """
    doc.annset().add(0, 3, "Ann1")
    return doc
def annfunc2(doc):
    """Add a "Type1" annotation over offsets 1-4 and return the document.

    The annotation goes into the annotation set named "set1". The same
    document object is returned.
    """
    doc.annset("set1").add(1, 4, "Type1")
    return doc
ann1 = AnnotatorFunction(annfunc1)
ann2 = AnnotatorFunction(annfunc2)
pipeline = Pipeline()
pipeline.add(ann1, name="FirstAnnotator")
pipeline.add(ann2, name="SecondAnnotator")
exe = SerialCorpusExecutor(pipeline, corpus=corp)
exe()
corp[2]
# use corp as source and create another ListCorpus as destination
corpnew = ListCorpus([])
exe2 = SerialCorpusExecutor(pipeline, source=corp, destination=corpnew)
exe2()
print("Length of corpnew:", len(corpnew))
corpnew[2]
Length of corpnew: 4
len(docs)
4
Preparation:
pip install -U spacy
python -m spacy download en_core_web_sm
import spacy
print(spacy.__version__)
from gatenlp.lib_spacy import AnnSpacy
nlp = spacy.load("en_core_web_sm")
annotator = AnnSpacy(pipeline=nlp, outsetname="Spacy")
doc2.annset("Spacy").clear() # avoid annotation duplication when running several times
doc2 = annotator(doc2)
doc2.show(htmlid="view2")
2.3.2
Preparation:
pip install -U stanza
python -c 'import stanza; stanza.download("en")'
import stanza
print(stanza.__version__)
from gatenlp.lib_stanza import AnnStanza
nlpstanza = stanza.Pipeline()
annotatorstanza = AnnStanza(pipeline=nlpstanza, outsetname="Stanza")
doc2.annset("Stanza").clear() # avoid annotation duplication when running several times
doc2 = annotatorstanza(doc2)
2021-02-06 19:46:59,441|INFO|stanza|Loading these models for language: en (English): ========================= | Processor | Package | ------------------------- | tokenize | combined | | pos | combined | | lemma | combined | | depparse | combined | | sentiment | sstplus | | ner | ontonotes | ========================= 2021-02-06 19:46:59,443|INFO|stanza|Use device: cpu 2021-02-06 19:46:59,443|INFO|stanza|Loading: tokenize 2021-02-06 19:46:59,449|INFO|stanza|Loading: pos
1.2
2021-02-06 19:46:59,672|INFO|stanza|Loading: lemma 2021-02-06 19:46:59,708|INFO|stanza|Loading: depparse 2021-02-06 19:47:00,069|INFO|stanza|Loading: sentiment 2021-02-06 19:47:00,498|INFO|stanza|Loading: ner 2021-02-06 19:47:01,102|INFO|stanza|Done loading processors!
doc2.show(htmlid="view3")
from gatenlp.processing.tokenizer import NLTKTokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer # get some tokenizer to use
nltk_tokenizer = NLTKTokenizer(nltk_tokenizer=NLTKWordTokenizer(), out_set="", token_type="Token")
doc2.annset().clear()
doc2 = nltk_tokenizer(doc2)
doc2
1) Use prepared list with already tokenized entries
from gatenlp.processing.gazetteer import TokenGazetteer
gazlist1 = [
(["Donald", "Trump"], dict(what="person", country="US")),
(["Boris", "Johnson"], dict(what="person", country="UK")),
(["Google"], dict(what="company", country="Everywhere, really!"))
]
tgaz1 = TokenGazetteer(gazlist1, fmt="gazlist", outset="TGaz1", outtype="Lookup")
doc2.annset("TGaz1").clear()
doc2 = tgaz1(doc2)
Result when using the prepared list:
doc2.show()
2) Load a list from a file, using JAVA GATE "def" format (https://gate.ac.uk/userguide/sec:annie:gazetteer):
E.g. data/gaz1.def
:
persons.lst
companies.lst
data/persons.lst
:
Donald Trump what=person country=US
Boris Johnson what=person country=UK
data/companies.lst
:
Google where=Everywhere, really!
tgaz2 = TokenGazetteer("data/gaz1.def", fmt="gate-def", outset="TGaz2", outtype="Lookup", tokenizer=nltk_tokenizer)
doc2.annset("TGaz2").clear()
doc2 = tgaz2(doc2)
2021-02-06 19:47:03,016|INFO|gatenlp.processing.gazetteer|Reading list file data/persons.lst 2021-02-06 19:47:03,017|INFO|gatenlp.processing.gazetteer|Reading list file data/companies.lst
Result when using the loaded GATE-style gazetteer files:
doc2
Other features:
Planned: String Gazetteer
Let's create a rule that annotates any Token which is within a PERSON or ORG annotation:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq, Or
from gatenlp.pam.matcher import FeatureMatcher, ifnot
r1 = Rule(
# first the pattern
Or ( Ann("Token", name="tok").within("ORG"),
Ann("Token", name="tok").within("PERSON")
),
# then the action for the pattern
AddAnn(name="tok", anntype="PersOrOrg")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type(["Token", "PERSON", "ORG"])
outset = doc2.annset("Pampac1")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)
15
doc2
Create a rule that annotates any Sequence of two or more Token annotations which have a upos tag of "PROPN", separated by at most one other arbitrary token:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq
from gatenlp.pam.matcher import FeatureMatcher, ifnot
feat = FeatureMatcher(upos="PROPN")
r1 = Rule(
# first the pattern
Seq( Ann("Token", features=feat),
N( Seq( N(Ann("Token", features=ifnot(feat)), min=0, max=1),
Ann("Token", features=feat)),
min=1, max=99),
name="seq1"
),
# then the action for the pattern
AddAnn(name="seq1", anntype="PROPNSEQ")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type("Token")
outset = doc2.annset("Pampac2")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)
8
Result: found 5 matches and added annotations for them:
doc2
gatenlp-gate-slave
commandLet's try Option 3 first: GATE_HOME
environment variable must be set, or must know GATE installation directory
from gatenlp.gateslave import GateSlave
gs = GateSlave()
# if GATE_HOME not set use gs = GateSlave(gatehome="/where/Gate/is/Installed")
# if java is not on the PATH use gs = GateSlave(java="/path/to/the/java/binary")
Trying to start GATE Slave on port=25333 host=127.0.0.1 log=false keep=false PythonSlaveRunning: starting server with 25333/127.0.0.1/VGaLZ6IHnYSqJxrkBWDChKxaWBM/false
# Create a GATE document on the JAVA GATE side and return a handle
gdoc1 = gs.createDocument("An example document mentioning Barack Obama and New York")
# Can call Java API methods on that handle and get/convert the result
print(gdoc1.getClass())
print(gdoc1.getName())
print(gdoc1.getAnnotationSetNames())
class gate.corpora.DocumentImpl GATE Document_00015 set()
# lets load the prepared ANNIE pipeline on the Java side and process the GATE document with it
gs.loadMavenPlugin("uk.ac.gate.plugins", "annie", "9.0-SNAPSHOT")
gpipe = gs.loadPipelineFromPlugin("uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp")
gcorp = gs.newCorpus()
gcorp.add(gdoc1)
gpipe.setCorpus(gcorp)
gpipe.execute()
So far, everything happened on the Java side. Use a GateSlave API method to convert the document into a Python GateNLP document:
pdoc1 = gs.gdoc2pdoc(gdoc1)
pdoc1
gs.close()
gs.close()
An annotator to process Python GateNLP documents with a Java GATE pipeline
from gatenlp.gateslave import GateSlaveAnnotator
# Specify a prepared GATE pipeline file to get loaded into Java GATE
# optionally add the gatehome=... kw argument
# optionally specify port using port=23445 or similar
gs_app = GateSlaveAnnotator(pipeline="data/annie.xgapp", port=25444)
Trying to start GATE Slave on port=25444 host=127.0.0.1 log=false keep=false PythonSlaveRunning: starting server with 25444/127.0.0.1/3CE0PDSjyojcj9oaEal3e2wV5m8/false
Example, running on a directory corpus:
dircorpus = DirFilesCorpus("data/dir1", sort=True)
exe = SerialCorpusExecutor(annotator=gs_app, corpus=dircorpus)
exe()
gs_app.close()
tmpdoc = dircorpus[2]
print(tmpdoc.features)
tmpdoc
Features({'gate.SourceURL': 'created from String', '__idx_140568721279688': 2, '__idx': 2, '__relpath': 'doc3.bdocjs', '__abspath': 'data/dir1/doc3.bdocjs'})
gatenlp
package!gatenlp
installed separately as wellconda create -n gatenlp python=3.9
sortedcontainers
packageconda install -c conda-forge sortedcontainers
pythonProgram
init parametertest1.py
, click Open, then OKfrom gatenlp import Document, AnnotationSet, GateNlpPr, interact
@GateNlpPr
class MyAnnotator:
    """Skeleton annotator class, registered through the GateNlpPr decorator."""

    # the following method is run on every document, this method must exist:
    def __call__(self, doc, **kwargs):
        pass

    # the start and finish methods are optional, if they exist the start
    # method is called before the first document of a corpus and the finish
    # method is called after the last document.
    # def start(self, **kwargs):
    #     pass
    # def finish(self, **kwargs):
    #     pass
# THE FOLLOWING MUST BE PRESENT SO THAT GATE CAN COMMUNICATE WITH THE PYTHON PROCESS!
if __name__ == "__main__":
    interact()
from gatenlp import Document, AnnotationSet, GateNlpPr, interact
@GateNlpPr
class MyAnnotator:
    """Example annotator that annotates and numbers each processed document."""

    def __init__(self):
        # number of documents processed so far; reset again in start()
        self.n_docs = 0

    def __call__(self, doc, **kwargs):
        """Add a "SomeType" annotation over offsets 0-3 to the document and
        store the running document count in the "docnr" feature."""
        self.n_docs += 1
        doc.annset().add(0, 3, "SomeType")
        doc.features["docnr"] = self.n_docs

    def start(self, **kwargs):
        """Print the received keyword arguments and reset the counter."""
        print("Processing starting, we got kwargs:", kwargs)
        self.n_docs = 0

    def finish(self, **kwargs):
        """Print how many documents were processed."""
        print("Processing finished, documents processed: ", self.n_docs)
if __name__ == "___main__":  # NOTE: changed from __main__ to ___main__ to prevent running in Notebook!
    interact()
interact()
communicates with the Java PythonPrstart
method is called, programParams
passed on__call__
is calledfinish
method is called and any results returned to JavaMore documentation: