Module gatenlp.lib_spacy
Support for using spacy: convert spacy documents and annotations to gatenlp documents and annotations.
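A minimal usage sketch (the model name "en_core_web_sm" and the example text are illustrative assumptions; any installed spaCy model works):
import spacy
from gatenlp.lib_spacy import spacy2gatenlp
nlp = spacy.load("en_core_web_sm")
spacy_doc = nlp("Barack Obama visited Berlin in 2013.")
gate_doc = spacy2gatenlp(spacy_doc)  # new gatenlp Document carrying the spacy annotations
for ann in gate_doc.annset():  # iterate over the default annotation set
    print(ann.type, ann.start, ann.end)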
"""
Support for using spacy: convert spacy documents and annotations to gatenlp documents and annotations.
"""
from gatenlp import Document
from gatenlp.processing.annotator import Annotator
import spacy
class AnnSpacy(Annotator):
""" """
def __init__(self, pipeline=None,
outsetname="",
token_type="Token",
spacetoken_type="SpaceToken",
sentence_type="Sentence",
nounchunk_type="NounChunk",
add_tokens=True,
# add_spacetokens=True, # not sure how to do this yet
add_entities=True,
add_sentences=True,
add_nounchunks=True,
add_deps=True,
ent_prefix=None,
):
"""
Create an annotator for running a spacy pipeline on documents.
:param pipeline: if this is specified, a pre-configured spacy pipeline (default: "en_core_web_sm"
pipeline)
:param outsetname: the annotation set name where to put the annotations
:param token_type: the annotation type for the token annotations
:param spacetoken_type: type of any space token annotations
:param sentence_type: the annotation type for the sentence annotations
:param nounchunk_type: annotation type for noun chunks
:param add_tokens: if token annotations should be added
:param add_entities: if true, add entity annotations
:param add_sentences: if sentence annotations should be added
:param add_nounchunks: if nounchunks should be added
:param add_deps: if dependencies should be added
:param ent_prefix: the prefix to add to all entity annotation types
:param kwargs: if no preconfigured pipeline is specified, pass these arguments to
the stanza.Pipeline() constructor see https://stanfordnlp.github.io/stanza/pipeline.html#pipeline
"""
self.outsetname = outsetname
self.token_type = token_type
self.sentence_type = sentence_type
self.add_entities = add_entities
self.ent_prefix = ent_prefix
self.spacetoken_type = spacetoken_type
self.nounchunk_type = nounchunk_type
self.add_tokens = add_tokens
self.add_sentences = add_sentences
self.add_nounchunks = add_nounchunks
self.add_deps = add_deps
if pipeline:
self.pipeline = pipeline
else:
self.pipeline = spacy.load("en_core_web_sm")
def __call__(self, doc, **kwargs):
spacy_doc = self.pipeline(doc.text)
spacy2gatenlp(spacy_doc, doc,
setname=self.outsetname,
token_type=self.token_type,
spacetoken_type=self.spacetoken_type,
sentence_type=self.sentence_type,
nounchunk_type=self.nounchunk_type,
add_tokens=self.add_tokens,
add_ents=self.add_entities,
add_nounchunks=self.add_nounchunks,
add_sents=self.add_sentences,
add_dep=self.add_deps,
ent_prefix=self.ent_prefix,
)
return doc
def apply_spacy(nlp, gatenlpdoc, setname=""):
"""Run the spacy nlp pipeline on the gatenlp document and transfer the annotations.
This modifies the gatenlp document in place.
Args:
nlp: spacy pipeline
gatenlpdoc: gatenlp document
setname: annotation set to receive the annotations (Default value = "")
tokens: an annotation set containing already known token annotations
Returns:
"""
spacydoc = nlp(gatenlpdoc.text)
return spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname)
def spacy2gatenlp(spacydoc, gatenlpdoc=None, setname="", token_type="Token",
spacetoken_type="SpaceToken", sentence_type="Sentence",
nounchunk_type="NounChunk",
add_tokens=True,
# add_spacetokens=True, # not sure how to do this yet
add_ents=True,
add_sents=True,
add_nounchunks=True,
add_dep=True,
ent_prefix=None,
):
"""Convert a spacy document to a gatenlp document. If a gatenlp document is already
provided, add the annotations from the spacy document to it. In this case the
original gatenlpdoc is used and gets modified.
Args:
spacydoc: a spacy document
gatenlpdoc: if None, a new gatenlp document is created otherwise this
document is added to. (Default value = None)
setname: the annotation set name to which the annotations get added, empty string
for the default annotation set.
token_type: the annotation type to use for tokens (Default value = "Token")
spacetoken_type: the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens: should annotations for tokens get added? If not, dependency parser
info cannot be added either. (Default value = True)
add_ents: should annotations for entities get added (Default value = True)
add_sents: should sentence annotations get added (Default value = True)
add_nounchunks: should noun chunk annotations get added (Default value = True)
add_dep: should dependency parser information get added (Default value = True)
ent_prefix: an optional prefix to add to all entity annotation types (Default value = None)
Returns:
the new or modified gatenlp document
"""
if gatenlpdoc is None:
retdoc = Document(spacydoc.text)
else:
retdoc = gatenlpdoc
toki2annid = {}
annset = retdoc.annset(setname)
for tok in spacydoc:
from_off = tok.idx
to_off = tok.idx + len(tok)
is_space = tok.is_space
fm = {
"_i": tok.i,
"is_alpha": tok.is_alpha,
"is_bracket": tok.is_bracket,
"is_currency": tok.is_currency,
"is_digit": tok.is_digit,
"is_left_punct": tok.is_left_punct,
"is_lower": tok.is_lower,
"is_oov": tok.is_oov,
"is_punct": tok.is_punct,
"is_quote": tok.is_quote,
"is_right_punct": tok.is_right_punct,
"is_sent_start": tok.is_sent_start,
"is_space": tok.is_space,
"is_stop": tok.is_stop,
"is_title": tok.is_title,
"is_upper": tok.is_upper,
"lang": tok.lang_,
"lemma": tok.lemma_,
"like_email": tok.like_email,
"like_num": tok.like_num,
"like_url": tok.like_url,
"orth": tok.orth,
"pos": tok.pos_,
"prefix": tok.prefix_,
"prob": tok.prob,
"rank": tok.rank,
"sentiment": tok.sentiment,
"tag": tok.tag_,
"shape": tok.shape_,
"suffix": tok.suffix_,
}
if spacydoc.is_nered and add_ents:
fm["ent_type"] = tok.ent_type_
if spacydoc.is_parsed and add_dep:
fm["dep"] = tok.dep_
if tok.is_space:
anntype = spacetoken_type
else:
anntype = token_type
annid = annset.add(from_off, to_off, anntype, fm).id
toki2annid[tok.i] = annid
# print("Added annotation with id: {} for token {}".format(annid, tok.i))
ws = tok.whitespace_
if len(ws) > 0:
annset.add(to_off, to_off+len(ws), spacetoken_type, {"is_space": True})
# if we have a dependency parse, now also add the parse edges
if spacydoc.is_parsed and add_tokens and add_dep:
for tok in spacydoc:
ann = annset.get(toki2annid[tok.i])
ann.features["head"] = toki2annid[tok.head.i]
ann.features["left_edge"] = toki2annid[tok.left_edge.i]
ann.features["right_edge"] = toki2annid[tok.right_edge.i]
if spacydoc.ents and add_ents:
for ent in spacydoc.ents:
if ent_prefix:
entname = ent_prefix + ent.label_
else:
entname = ent.label_
annset.add(ent.start_char, ent.end_char, entname, {"lemma": ent.lemma_})
if spacydoc.sents and add_sents:
for sent in spacydoc.sents:
annset.add(sent.start_char, sent.end_char, sentence_type, {})
if spacydoc.noun_chunks and add_nounchunks:
for chunk in spacydoc.noun_chunks:
annset.add(chunk.start_char, chunk.end_char, nounchunk_type, {})
return retdoc
Functions
def apply_spacy(nlp, gatenlpdoc, setname='')
-
Run the spacy nlp pipeline on the gatenlp document and transfer the annotations. This modifies the gatenlp document in place.
Args
nlp
- spacy pipeline
gatenlpdoc
- gatenlp document
setname
- annotation set to receive the annotations (Default value = "")
Returns
the gatenlp document with the added annotations
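A short sketch of apply_spacy putting the annotations into a named set (the model name and example text are assumptions for illustration):
import spacy
from gatenlp import Document
from gatenlp.lib_spacy import apply_spacy
nlp = spacy.load("en_core_web_sm")
doc = Document("The quick brown fox jumps over the lazy dog.")
doc = apply_spacy(nlp, doc, setname="Spacy")
# each Token annotation carries the features copied by spacy2gatenlp, e.g. "pos" and "lemma"
for ann in doc.annset("Spacy").with_type("Token"):
    print(ann.start, ann.end, ann.features.get("pos"))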
def spacy2gatenlp(spacydoc, gatenlpdoc=None, setname='', token_type='Token', spacetoken_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_ents=True, add_sents=True, add_nounchunks=True, add_dep=True, ent_prefix=None)
-
Convert a spacy document to a gatenlp document. If a gatenlp document is already provided, add the annotations from the spacy document to it. In this case the original gatenlpdoc is used and gets modified.
Args
spacydoc
- a spacy document
gatenlpdoc
- if None, a new gatenlp document is created, otherwise the annotations are added to this document. (Default value = None)
setname
- the annotation set name to which the annotations get added, empty string for the default annotation set. (Default value = "")
token_type
- the annotation type to use for tokens (Default value = "Token")
spacetoken_type
- the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type
- the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type
- the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens
- should annotations for tokens get added? If not, dependency parser info cannot be added either. (Default value = True)
add_ents
- should annotations for entities get added (Default value = True)
add_sents
- should sentence annotations get added (Default value = True)
add_nounchunks
- should noun chunk annotations get added (Default value = True)
add_dep
- should dependency parser information get added (Default value = True)
ent_prefix
- an optional prefix to add to all entity annotation types (Default value = None)
Returns
the new or modified gatenlp document
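A sketch of the conversion options; the example text and the "Spacy_" prefix are made-up values for illustration:
import spacy
from gatenlp.lib_spacy import spacy2gatenlp
nlp = spacy.load("en_core_web_sm")
sdoc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
gdoc = spacy2gatenlp(
    sdoc,
    setname="spacy",
    add_sents=False,       # skip Sentence annotations
    add_nounchunks=False,  # skip NounChunk annotations
    ent_prefix="Spacy_",   # entity types become e.g. "Spacy_ORG"
)
print({ann.type for ann in gdoc.annset("spacy")})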
Classes
class AnnSpacy (pipeline=None, outsetname='', token_type='Token', spacetoken_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_entities=True, add_sentences=True, add_nounchunks=True, add_deps=True, ent_prefix=None)
-
Create an annotator for running a spacy pipeline on documents.
:param pipeline: a pre-configured spacy pipeline to use; if None, the "en_core_web_sm" pipeline is loaded
:param outsetname: the annotation set name where to put the annotations
:param token_type: the annotation type for the token annotations
:param spacetoken_type: the annotation type for space token annotations
:param sentence_type: the annotation type for the sentence annotations
:param nounchunk_type: the annotation type for noun chunk annotations
:param add_tokens: if true, add token annotations
:param add_entities: if true, add entity annotations
:param add_sentences: if true, add sentence annotations
:param add_nounchunks: if true, add noun chunk annotations
:param add_deps: if true, add dependency parse information
:param ent_prefix: the prefix to add to all entity annotation types
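Since AnnSpacy is an Annotator, an instance is simply called on a document; a minimal sketch (assumes the default "en_core_web_sm" model is installed):
from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy
annotator = AnnSpacy(outsetname="Spacy", add_nounchunks=False)
doc = annotator(Document("She lives in Vienna."))
print(len(doc.annset("Spacy")))  # number of annotations added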
Ancestors
- Annotator
- abc.ABC
Inherited members