Module gatenlp.processing.executor

Expand source code
from gatenlp.processing.pipeline import _has_method

# Per-name overrides read by the pdoc documentation generator.
# NOTE(review): the key names "Annotator", which is not defined in the visible
# part of this module; presumably copied from another module -- confirm whether
# "SerialCorpusExecutor.__call__" was intended.
__pdoc__ = {"Annotator.__call__": True}


class SerialCorpusExecutor:
    """
    Runs an annotator (e.g. a pipeline) serially over either a corpus or a document source.

    For a corpus without a destination, each document in the corpus gets processed and stored back
    in turn. If a destination is given (for a corpus or a source), all documents that are the result of
    processing get appended to the destination instead.

    The following counters are updated while running: n_in (items read, including None items),
    n_none (None items, which are skipped), n_err (documents for which the annotator raised an exception)
    and n_out (documents appended to the destination).
    """
    def __init__(self,
                 annotator,
                 corpus=None,
                 source=None,
                 destination=None,
                 readonly=False,
                 exit_on_error=False
                 ):
        """
        Creates an Executor to run an annotator on either a corpus or a document source. If a corpus is specified,
        and no destination is specified,
        the document passed on to the annotator must be returned by the annotator and gets stored back into the
        corpus, unless readonly is True.

        If a corpus is specified and a destination is specified, the corpus is iterated over in sequence and
        documents are processed by the annotator and all documents returned by the annotator are appended to the
        destination.

        If a document source is processed, the document gets processed and the annotator can return zero,
        one or several documents which are appended to the destination unless readonly is set to True.

        Args:
            annotator: the callable to run on each document. If it has the methods start and/or finish,
              these are called before the first and after the last document. No reduce method is called
              because processing is single-threaded.
            corpus: the corpus to process.
            source: a document source to process. Corpus and source are mutually exclusive.
            destination: if specified, the result documents are appended to the destination unless
              readonly is True.
            readonly: if True, nothing is saved back to the corpus or appended to the destination.
            exit_on_error: if True, processing stops at the first exception raised by the annotator and
              the call returns None without calling finish. If False, the failing document is counted
              in n_err and processing continues with the next document.

        Raises:
            Exception: if both a corpus and a source are specified, or neither is.
        """
        if (corpus is None) == (source is None):
            raise Exception("Exactly one of corpus or source must be specified")
        self.corpus = corpus
        self.source = source
        self.destination = destination
        self.annotator = annotator
        self.readonly = readonly
        self.exit_on_error = exit_on_error
        self.n_in = 0
        self.n_none = 0  # number of None items from the corpus/source, ignored
        self.n_out = 0
        self.n_err = 0

    def _append_results(self, ret):
        """Append the document or list of documents returned by the annotator to the destination."""
        if ret is None or self.destination is None:
            return
        results = ret if isinstance(ret, list) else [ret]
        for result in results:
            self.destination.append(result)
            self.n_out += 1

    def __call__(self, **kwargs):
        """
        Process all documents from the corpus or source with the annotator.

        Args:
            **kwargs: passed on unchanged to every call of the annotator.

        Returns:
            The return value of the annotator's finish method if it has one, otherwise None. None is also
            returned if processing was aborted because of exit_on_error.

        Raises:
            Exception: when updating a corpus in place and the annotator does not return the very
              document object it was given.
        """
        import logging  # local import so the block does not depend on a module-level import
        logger = logging.getLogger(__name__)

        if _has_method(self.annotator, "start"):
            self.annotator.start()

        # Compare against None: an empty corpus is still a corpus and must not fall through to the source.
        from_corpus = self.corpus is not None
        store_back = from_corpus and self.destination is None
        docs = self.corpus if from_corpus else self.source

        for idx, doc in enumerate(docs):
            self.n_in += 1
            if doc is None:
                self.n_none += 1
                continue
            try:
                ret = self.annotator(doc, **kwargs)
            except Exception:
                self.n_err += 1
                # Record the failure instead of dropping it without any trace.
                logger.exception("Error processing document number %d", idx)
                if self.exit_on_error:
                    return None
                continue
            if self.readonly:
                # Nothing is saved back to the corpus or appended to the destination.
                continue
            if store_back:
                if ret is not doc:
                    raise Exception("Cannot update corpus if Annotator does not return the processed document")
                self.corpus[idx] = doc
            else:
                self._append_results(ret)

        # NOTE: since this is single-threaded, no reduce call is necessary!
        if _has_method(self.annotator, "finish"):
            return self.annotator.finish()
        return None

Classes

class SerialCorpusExecutor (annotator, corpus=None, source=None, destination=None, readonly=False, exit_on_error=False)

Runs a pipeline on either a corpus, where each document in the corpus gets processed and stored back in turn, or on a source and destination, where each document from the source gets processed and all documents that are the result of processing get appended to the destination.

Creates an Executor to run an annotator on either a corpus or a document source. If a corpus is specified, and no destination is specified, the document passed on to the annotator must be returned by the annotator and gets stored back into the corpus, unless readonly is True.

If a corpus is specified and a destination is specified, the corpus is iterated over in sequence and documents are processed by the annotator and all documents returned by the annotator are appended to the destination.

If a document source is processed, the document gets processed and the annotator can return zero, one or several documents which are appended to the destination unless readonly is set to True.

An exception is thrown if both a corpus and a source are specified, or if neither is specified.

Args

annotator
the callable to run on each document. If this is an instance of Annotator, the additional methods start, finish, and reduce are called as appropriate
corpus
the corpus to process.
source
a document source to process. Corpus and source are mutually exclusive.
destination
if specified, the result documents are appended to the destination unless readonly is True.
readonly
if True, nothing is saved back to the corpus or appended to the destination.
Expand source code
class SerialCorpusExecutor:
    """
    Runs an annotator (e.g. a pipeline) serially over either a corpus or a document source.

    For a corpus without a destination, each document in the corpus gets processed and stored back
    in turn. If a destination is given (for a corpus or a source), all documents that are the result of
    processing get appended to the destination instead.

    The following counters are updated while running: n_in (items read, including None items),
    n_none (None items, which are skipped), n_err (documents for which the annotator raised an exception)
    and n_out (documents appended to the destination).
    """
    def __init__(self,
                 annotator,
                 corpus=None,
                 source=None,
                 destination=None,
                 readonly=False,
                 exit_on_error=False
                 ):
        """
        Creates an Executor to run an annotator on either a corpus or a document source. If a corpus is specified,
        and no destination is specified,
        the document passed on to the annotator must be returned by the annotator and gets stored back into the
        corpus, unless readonly is True.

        If a corpus is specified and a destination is specified, the corpus is iterated over in sequence and
        documents are processed by the annotator and all documents returned by the annotator are appended to the
        destination.

        If a document source is processed, the document gets processed and the annotator can return zero,
        one or several documents which are appended to the destination unless readonly is set to True.

        Args:
            annotator: the callable to run on each document. If it has the methods start and/or finish,
              these are called before the first and after the last document. No reduce method is called
              because processing is single-threaded.
            corpus: the corpus to process.
            source: a document source to process. Corpus and source are mutually exclusive.
            destination: if specified, the result documents are appended to the destination unless
              readonly is True.
            readonly: if True, nothing is saved back to the corpus or appended to the destination.
            exit_on_error: if True, processing stops at the first exception raised by the annotator and
              the call returns None without calling finish. If False, the failing document is counted
              in n_err and processing continues with the next document.

        Raises:
            Exception: if both a corpus and a source are specified, or neither is.
        """
        if (corpus is None) == (source is None):
            raise Exception("Exactly one of corpus or source must be specified")
        self.corpus = corpus
        self.source = source
        self.destination = destination
        self.annotator = annotator
        self.readonly = readonly
        self.exit_on_error = exit_on_error
        self.n_in = 0
        self.n_none = 0  # number of None items from the corpus/source, ignored
        self.n_out = 0
        self.n_err = 0

    def _append_results(self, ret):
        """Append the document or list of documents returned by the annotator to the destination."""
        if ret is None or self.destination is None:
            return
        results = ret if isinstance(ret, list) else [ret]
        for result in results:
            self.destination.append(result)
            self.n_out += 1

    def __call__(self, **kwargs):
        """
        Process all documents from the corpus or source with the annotator.

        Args:
            **kwargs: passed on unchanged to every call of the annotator.

        Returns:
            The return value of the annotator's finish method if it has one, otherwise None. None is also
            returned if processing was aborted because of exit_on_error.

        Raises:
            Exception: when updating a corpus in place and the annotator does not return the very
              document object it was given.
        """
        import logging  # local import so the block does not depend on a module-level import
        logger = logging.getLogger(__name__)

        if _has_method(self.annotator, "start"):
            self.annotator.start()

        # Compare against None: an empty corpus is still a corpus and must not fall through to the source.
        from_corpus = self.corpus is not None
        store_back = from_corpus and self.destination is None
        docs = self.corpus if from_corpus else self.source

        for idx, doc in enumerate(docs):
            self.n_in += 1
            if doc is None:
                self.n_none += 1
                continue
            try:
                ret = self.annotator(doc, **kwargs)
            except Exception:
                self.n_err += 1
                # Record the failure instead of dropping it without any trace.
                logger.exception("Error processing document number %d", idx)
                if self.exit_on_error:
                    return None
                continue
            if self.readonly:
                # Nothing is saved back to the corpus or appended to the destination.
                continue
            if store_back:
                if ret is not doc:
                    raise Exception("Cannot update corpus if Annotator does not return the processed document")
                self.corpus[idx] = doc
            else:
                self._append_results(ret)

        # Single-threaded execution: there are no partial results to combine, so no reduce call is made.
        if _has_method(self.annotator, "finish"):
            return self.annotator.finish()
        return None