Module gatenlp.document

Module that implements the Document class for representing gatenlp documents with features and annotation sets.

Expand source code
"""
Module that implements the Document class for representing gatenlp documents with features and annotation sets.
"""

from typing import KeysView
from gatenlp.annotation_set import AnnotationSet
from gatenlp.annotation import Annotation
from gatenlp.changelog import *
from gatenlp.features import Features
import logging
import importlib
import copy as lib_copy
from gatenlp.gatenlpconfig import gatenlpconfig

# Module-level logger for this module; its level is fixed to INFO below.
# NOTE(review): logging.basicConfig() touches the root logger as an import side effect —
# confirm that this is intended for a library module.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Document:
    """Represent a GATE document. This is different from the original Java GATE representation in several ways:
    
    * the text is not mutable and can only be set at creation time, so there is no "edit" method
    
    * as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is
      no separate "FeatureMap" to store them
    
    * does not support listener callbacks
    * there is no separate abstraction for "content", the only content possible is text which is a unicode string
      that can be accessed with the "text" property
    * Spans of text can be directly accessed using doc[from:to]
    * Features may only have string keys and values which can be json-serialised
    * Annotation offsets by default are number of Unicode code points, this is different from Java where the offsets
      are UTF-16 Unicode code units
    * Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index
      of UTF-16 code unit and back)
    * No part of the document has to be present, not even the text (this allows saving just the annotations separately
      from the text)
    * Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)

    Args:
      text: the text of the document. The text can be None to indicate that no initial text should be set. Once
    the text has been set for a document, it is immutable and cannot be changed.
      features: the initial document features to set, a sequence of key/value tuples
      changelog: a ChangeLog instance to use to log changes.

    Returns:

    """

    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None):
        """Create a document from the optional text, features and changelog.

        Args:
          text: the document text or None
          features: the initial document features, passed on to Features
          changelog: the ChangeLog instance to record changes in, or None
        """
        # Both arguments are optional, but if given they must have the expected type.
        assert text is None or isinstance(text, str)
        assert changelog is None or isinstance(changelog, ChangeLog)
        # The changelog is stored before the features get created so that it already
        # exists should the features invoke the _log_feature_change callback.
        self._changelog = changelog
        self._features = Features(features, logger=self._log_feature_change)
        self._text = text
        self._name = ""
        self._annotation_sets = {}
        self.offset_type = OFFSET_TYPE_PYTHON

    @property
    def name(self):
        """ """
        return self._name

    @name.setter
    def name(self, val):
        """

        Args:
          val: 

        Returns:

        """
        if val is None:
            val = ""
        if not isinstance(val, str):
            raise Exception("Name must be a string")
        self._name = val
        if self._changelog is not None:
            ch = {"command": "name:set"}
            ch["name"] = val
            self._changelog.append(ch)

    def _ensure_type_python(self) -> None:
        """Raise an exception unless the offset type of this document is OFFSET_TYPE_PYTHON.

        Raises:
          Exception: if the current offset type is anything other than OFFSET_TYPE_PYTHON
        """
        if self.offset_type == OFFSET_TYPE_PYTHON:
            return
        # The message names the method that actually performs the conversion, to_offset_type.
        raise Exception(
            "Document cannot be used if it is not type PYTHON, use to_offset_type(OFFSET_TYPE_PYTHON) first"
        )

    def _fixup_annotations(self, method: Callable) -> None:
        """

        Args:
          method: Callable: 

        Returns:

        """
        annset_names = self._annotation_sets.keys()
        for annset_name in annset_names:
            annset = self._annotation_sets[annset_name]
            if annset._annotations is not None:
                for ann in annset._annotations.values():
                    ann._start = method(ann._start)
                    ann._end = method(ann._end)

    def to_offset_type(self, offsettype: str) -> OffsetMapper:
        """Convert the offsets of all annotations of this document to the requested
        offset type. Nothing happens if the document already has that offset type.

        NOTE: a ChangeLog of the document is NOT converted!

        Args:
          offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

        Returns:
          the offset mapper used for the conversion, or None if nothing had to be converted

        Raises:
          Exception: if the requested and the current offset type are not a valid combination
        """
        current = self.offset_type
        if offsettype == current:
            return None
        if offsettype == OFFSET_TYPE_JAVA and current == OFFSET_TYPE_PYTHON:
            # python -> java
            mapper = OffsetMapper(self._text)
            self._fixup_annotations(mapper.convert_to_java)
            self.offset_type = OFFSET_TYPE_JAVA
            return mapper
        if offsettype == OFFSET_TYPE_PYTHON and current == OFFSET_TYPE_JAVA:
            # java -> python
            mapper = OffsetMapper(self._text)
            self._fixup_annotations(mapper.convert_to_python)
            self.offset_type = OFFSET_TYPE_PYTHON
            return mapper
        raise Exception("Odd offset type")

    def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
        """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
        a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

        The document is modified in-place.

        Args:
          changes: one or more changes
          handle_existing_anns: what to do if the change from the changelog tries to add an annotation
            with an annotation id that already exists in the target set. Any value other than the
            ADDANN_* constants handled below leaves the existing annotation untouched.
            (Default value = ADDANN_ADD_WITH_NEW_ID)

        Raises:
          Exception: if a change has no "command" field or if the command is not known
        """
        if isinstance(changes, dict):
            changes = [changes]
        elif isinstance(changes, ChangeLog):
            changes = changes.changes
        for change in changes:
            cmd = change.get("command")
            fname = change.get("feature")
            fvalue = change.get("value")
            features = change.get("features")
            sname = change.get("set")
            annid = change.get("id")
            if cmd is None:
                raise Exception("Change without field 'command'")
            if cmd == ACTION_ADD_ANNSET:
                assert sname is not None
                self.annset(sname)
            elif cmd == ACTION_ADD_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                start = change.get("start")
                end = change.get("end")
                anntype = change.get("type")

                if ann is None:
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_IGNORE:
                    pass
                elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                    # the set assigns a fresh id; the features of the change must not get lost
                    anns.add(start, end, anntype, features=features)
                elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                    anns.remove(annid)
                    # keyword arguments, like the plain add above, so that id and features
                    # cannot end up in the wrong parameter
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                    # a change without features has nothing to update with
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                    ann.features.clear()
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                    fns = ann.feature_names()
                    for f in (features or {}):
                        if f not in fns:
                            ann.features[f] = features[f]
            elif cmd == ACTION_CLEAR_ANNS:
                assert sname is not None
                anns = self.annset(sname)
                anns.clear()
            elif cmd == ACTION_CLEAR_ANN_FEATURES:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    ann.features.clear()
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_CLEAR_DOC_FEATURES:
                self.features.clear()
            elif cmd == ACTION_SET_ANN_FEATURE:
                assert fname is not None
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                ann.features[fname] = fvalue
            elif cmd == ACTION_DEL_ANN_FEATURE:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    if fname is not None:
                        ann.features.pop(fname, None)
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_DEL_DOC_FEATURE:
                assert fname is not None
                self.features.pop(fname, None)
            elif cmd == ACTION_DEL_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                anns.remove(annid)
            elif cmd == ACTION_SET_DOC_FEATURE:
                assert fname is not None
                self.features[fname] = fvalue
            else:
                raise Exception("Unknown ChangeLog action: ", cmd)

    @property
    def features(self):
        """Accesses the features as a FeatureViewer instance. Changes made on this object are
        reflected in the document and recorded in the change log, if there is one.
        
        :return: A FeatureViewer view of the document features.

        Args:

        Returns:

        """
        return self._features


    @property
    def changelog(self):
        """Get the ChangeLog or None if no ChangeLog has been set.
        
        :return: the changelog

        Args:

        Returns:

        """
        return self._changelog

    @changelog.setter
    def changelog(self, chlog):
        """Make the document use the given changelog to record all changes
        from this moment on.

        Args:
          chlog: the new changelog to use or None to not use any

        Returns:
          the changelog used previously or None

        """
        oldchlog = self._changelog
        self._changelog = chlog
        return oldchlog

    @property
    def text(self) -> str:
        """Get the text of the document. For a partial document, the text may be None.

        Returns:
          the text of the document

        Raises:
          Exception: if the offset type of the document is not OFFSET_TYPE_PYTHON
        """
        self._ensure_type_python()
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        """Set the text of the document. This is only possible as long as it has not been set
        yet, after that, the text is immutable.

        Args:
          value: the text for the document
          value: str: 

        Returns:

        """
        if self._text is None:
            self._text = value
        else:
            raise NotImplementedError("Text cannot be modified")

    def _log_feature_change(self, command: str, feature: str = None, value=None) -> None:
        """

        Args:
          command: str: 
          feature: str:  (Default value = None)
          value:  (Default value = None)

        Returns:

        """
        if self._changelog is None:
            return
        command = "doc-"+command
        ch = {"command": command}
        if command == "doc-feature:set":
            ch["feature"] = feature
            ch["value"] = value
        self._changelog.append(ch)

    def __len__(self) -> int:
        """
        Return the length of the text, 0 if the document has no text.
        Note: this raises an exception if the offset type of the document is not python.

        :return: the length of the document text
        """
        self._ensure_type_python()
        return 0 if self._text is None else len(self._text)

    def __getitem__(self, span) -> str:
        """
        Get the text for the given span.

        :param span: an annotation or annotation set, whose offset span is used, or anything
        that can directly index the text, like a single number or a slice from:to.
        :return: the text of the span
        """
        self._ensure_type_python()
        text = self.text
        if isinstance(span, Annotation):
            return text[span._start:span._end]
        if isinstance(span, AnnotationSet):
            return text[span.start():span.end()]
        return text[span]

    def annset(self, name: str = "") -> AnnotationSet:
        """Get the named annotation set, if name is not given or the empty string, the default annotation set.
        If the annotation set does not already exist, it is created and the creation is
        recorded in the changelog, if there is one.

        Args:
          name: the annotation set name, the empty string is used for the "default annotation set".
            (Default value = "")

        Returns:
          the specified annotation set.

        Raises:
          Exception: if the offset type of the document is not OFFSET_TYPE_PYTHON
        """
        self._ensure_type_python()
        if name in self._annotation_sets:
            return self._annotation_sets[name]
        annset = AnnotationSet(owner_doc=self, name=name)
        self._annotation_sets[name] = annset
        # Test for presence, not truthiness, like the other changelog checks in this class:
        # a changelog object that evaluates as false (e.g. while it is still empty) must
        # still get the entry.
        if self._changelog is not None:
            self._changelog.append({
                "command": "annotations:add",
                "set": name})
        return annset

    def annset_names(self) -> list:
        """Get the names of all annotation sets of the document.

        Returns:
          a new list with the annotation set names (not a view of the internal dictionary)

        Raises:
          Exception: if the offset type of the document is not OFFSET_TYPE_PYTHON
        """
        self._ensure_type_python()
        return list(self._annotation_sets.keys())
    
    def remove_annset(self, name: str):
        """Completely remove the annotation set.

        Args:
          name: name of the annotation set to remove
          name: str: 

        Returns:

        """
        if name not in self._annotation_sets:
            raise Exception(f"AnnotationSet with name {name} does not exist")
        del self._annotation_sets[name]
        if self._changelog:
            self._changelog.append({
                "command": "annotations:remove",
                "set": name})

    def __repr__(self) -> str:
        """
        String representation of the document, showing all content.

        :return: string representation
        """
        return "Document({},features={},anns={})".format(self.text, self._features, self._annotation_sets.__repr__())

    def __str__(self) -> str:
        asets = "["+",".join([f"'{k}':{len(v)}" for k, v in self._annotation_sets.items()])+"]"
        return "Document({},features={},anns={})".format(self.text, self._features, asets)

    def to_dict(self, offset_type=None, **kwargs):
        """Convert this instance to a dictionary that can be used to re-create the instance with
        from_dict.
        NOTE: if there is an active changelog, it is not included in the output as this
        field is considered a transient field!

        Args:
          offset_type: convert to the given offset type on the fly (Default value = None)
          **kwargs: 

        Returns:
          the dictionary representation of this instance

        """
        # if the specified offset type is equal to what we have, do nothing, otherwise
        # create an offset mapper and pass it down to where we actually convert the annotations

        om = None
        if offset_type is not None:
            assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON
            if offset_type != self.offset_type:
                if self._text is not None:
                    om = OffsetMapper(self._text)
                    kwargs["offset_mapper"] = om
                    kwargs["offset_type"] = offset_type
        else:
            offset_type = self.offset_type

        return {
            "annotation_sets": {name: aset.to_dict(**kwargs) for name, aset in self._annotation_sets.items() },
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": offset_type,
            "name": self.name,
        }

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Return a Document instance as represented by the dictionary dictrepr.

        Args:
          dictrepr: the dictionary representation, as created by to_dict. A missing
            "annotation_sets" entry is treated as "no annotation sets".
          **kwargs: not used

        Returns:
          the initialized Document instance

        Raises:
          Exception: if the offset type in dictrepr is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON
        """
        feats = dictrepr.get("features")
        doc = Document(dictrepr.get("text"), features=feats)
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        # No part of a document has to be present, so tolerate a representation that
        # has no annotation sets at all.
        annsets_repr = dictrepr.get("annotation_sets") or {}
        doc._annotation_sets = {
            name: AnnotationSet.from_dict(adict, owner_doc=doc)
            for name, adict in annsets_repr.items()
        }
        return doc

    def save(self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Save the document to the destination file.

        Args:
          destination: either a file name or something that has a write(string) method.
          fmt: serialization format, by default the format is inferred from the file extension.
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the document saver.
          **kwargs: 

        Returns:

        """
        if fmt is None or isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(destination, fmt)
            saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
        else:
            # assume fmt is a callable to get used directly
            fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)

    def save_mem(self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Serialize to a string or bytes in the given format.

        Args:
          fmt: serialization format to use. (Default value = "json")
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the format.
          **kwargs: 

        Returns:

        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(None, fmt)
            return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
        else:
            fmt(Document, self, to_mem=True, offset_type=offset_type, **kwargs)

    @staticmethod
    def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
        """Load or import a document from the given source. The source can be a file path or file name or
        a URL. If the type of the source is str, then if it starts with "http[s]://" it will get treated
        as a URL. In order to deliberately use a file instead of a URL, create a pathlib Path, in order to
        deliberately use URL instead of a file parse the URL using urllib.

        Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

        Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

        NOTE: the offset type of the document is always converted to PYTHON when loading!

        Args:
          source: the URL or file path to load from.
          fmt: the format of the source. By default the format is inferred by the file extension.
            The format can be a format mnemonic like "json", "html", or a known mime type like "text/bdocjs".
            Anything that is neither None nor a string is used as the loader callable directly.
          mod: the name of a module where the document loader is implemented. (Default value = "gatenlp.serialization.default")
          **kwargs: additional format specific keyword arguments to pass to the loader

        Returns:
          the loaded document
        """
        if fmt is None or isinstance(fmt, str):
            loader = importlib.import_module(mod).get_document_loader(source, fmt)
        else:
            loader = fmt
        doc = loader(Document, from_ext=source, **kwargs)
        # documents stored with Java offsets get converted right after loading
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    @staticmethod
    def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
        """Create a document from the in-memory serialization in source. Source can be a string or
        bytes, depending on the format.

        Note: the offset type is always converted to PYTHON when loading!

        Args:
          source: the string/bytes to deserialize
          fmt: the format; anything that is not a string is used as the loader callable
            directly (Default value = "json")
          mod: the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
          **kwargs: additional arguments to pass to the loader

        Returns:
          the loaded document

        Raises:
          Exception: if no format is given
        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            loader = importlib.import_module(mod).get_document_loader(None, fmt)
        else:
            loader = fmt
        doc = loader(Document, from_mem=source, **kwargs)
        # documents serialized with Java offsets get converted right after loading
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    def __copy__(self):
        """
        Creates a shallow copy except the changelog which is set to None.

        The copy shares the annotation sets with this document, gets a copy of the
        features as created by their copy() method, and has the same name and offset type.

        :return: shallow copy of the document
        """
        doc = Document(self._text)
        # set the private field: a copy must keep the name of the original
        doc._name = self._name
        doc._annotation_sets = self._annotation_sets
        doc.offset_type = self.offset_type
        doc._features = self._features.copy()
        return doc

    def copy(self):
        """Creates a shallow copy except the changelog which is set to None.
        This is the same as what `copy.copy(doc)` creates through `__copy__`.

        Returns:
          shallow copy of the document
        """
        return self.__copy__()

    def __deepcopy__(self, memo):
        """
        Creates a deep copy, except the changelog which is set to None.

        :param memo: the memoization dictionary to use.

        :return: a deep copy of the document.
        """
        if memo is None:
            memo = {}
        if self._features is not None:
            fts = lib_copy.deepcopy(self._features.to_dict(), memo)
        else:
            fts = None
        doc = Document(self._text, features=fts)
        # Register the copy before descending into the annotation sets, so that any
        # reference back to this document found while copying them resolves to the
        # new document instead of triggering another copy of this one.
        memo[id(self)] = doc
        doc._changelog = None
        doc._name = self._name
        doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        doc.offset_type = self.offset_type
        return doc

    def deepcopy(self, memo=None):
        """Creates a deep copy, except the changelog which is set to None.
        The work is done by `copy.deepcopy`, which in turn uses `__deepcopy__`.

        Args:
          memo: the memoization dictionary to use. (Default value = None)

        Returns:
          a deep copy of the document.
        """
        return lib_copy.deepcopy(self, memo)

    def _repr_html_(self):
        """
        Render function for Jupyter notebooks. Returns the html-ann-viewer HTML
        as created by `_notebook_show` without displaying it.
        """
        return self._notebook_show(htmlid=None, display=False)

    def notebook_show(self, htmlid=None):
        """
        Show the document in a Jupyter notebook. In contrast to the automatic rendering,
        this allows to assign a specific htmlid so the generated HTML can be directly
        styled afterwards. The rendered document is sent to the cell directly
        (no display/HTML necessary) and nothing is returned.

        Args:
            htmlid: the HTML id prefix to use for classes and element ids.

        """
        self._notebook_show(display=True, htmlid=htmlid)

    def _notebook_show(self, htmlid=None, display=False):
        """
        Create the html-ann-viewer HTML of the document for a notebook and either
        display it or return it. The viewer javascript is initialized first unless
        gatenlpconfig.notebook_js_initialized says that this already happened.

        Args:
            htmlid: the HTML id prefix to use for classes and element ids.
            display: if True send the HTML to the notebook cell and return nothing,
                otherwise return the HTML.
        """
        from gatenlp.gatenlpconfig import gatenlpconfig
        from gatenlp.serialization.default import HtmlAnnViewerSerializer
        from IPython.display import display_html
        js_ready = gatenlpconfig.notebook_js_initialized
        if not js_ready:
            HtmlAnnViewerSerializer.init_javscript()
            gatenlpconfig.notebook_js_initialized = True
        html = self.save_mem(
            fmt="html-ann-viewer",
            notebook=True,
            add_js=False,
            offline=True,
            htmlid=htmlid,
        )
        if not display:
            return html
        display_html(html, raw=True)



class MultiDocument(Document):
    """
    NOTE: This is not implemented fully yet!

    A MultiDocument can store more than one document, each identified by their ids. One of those
    documents is always the "active" one and the MultiDocument can be used just like a Document
    with that content. In addition, there are methods to make each of the other documents active
    and to create mappings between annotations of pairs of documents.

    An AnnotationMapping is something that maps annotations to annotations, either for the same
    document, from the same or different sets, of for different documents. Once an annotation
    becomes part of a mapping, that annotation is becoming immutable. Even if the original annotation
    in the document changes or gets removed, the mapping retains the original copy of the annotation
    until the mapping is modified or removed.
    """
    # TODO: ALL necessary fields of the document must be references of mutable objects so that
    # if something is changed for the active document the one stored in the documents map is
    # really updated as well, or we must override the updating method to change both!
    # A better way could be to override all methods to always directly change the document in the
    # documents map, and simply pass on all calls to the activated document.
    # In that case, to_dict and from_dict would actually generate the fields for normal document
    # readers and ignore them on restore
    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None, docid=0):
        """Create a MultiDocument with one initial document, stored under docid and activated.

        Args:
          text: the text of the initial document
          features: the features of the initial document
          changelog: the changelog of the initial document
          docid: the id to store the initial document under (Default value = 0)
        """
        self._docid = None
        self._mappings = None   # TODO: we need to implement this
        # map from document id to document, initially just the one built from the arguments
        self.documents = {docid: Document(text, features=features, changelog=changelog)}
        self.activate(docid)

    @property
    def docid(self):
        return self._docid

    def activate(self, docid=0):
        """Make the document stored under docid the active one: the fields of this
        MultiDocument are set from the fields of that document.

        Args:
          docid: the id of the document to activate (Default value = 0)

        Raises:
          Exception: if no document is stored under docid
        """
        if docid not in self.documents:
            raise Exception(f"Cannot activate id {docid}, not in MultiDocument")
        doc = self.documents[docid]
        self._changelog = doc._changelog
        self._features = doc._features
        self._annotation_sets = doc._annotation_sets
        self._text = doc._text
        # The annotation sets are shared with the stored document, so the offset type
        # has to be the one of that document as well and must not be assumed.
        self.offset_type = doc.offset_type
        self._name = doc._name
        self._docid = docid

    def add_document(self, doc, docid=None, activate=False):
        if docid is None:
            docid = len(self.documents)
        elif docid in self.documents:
            raise Exception(f"Cannot add document to MultiDocument, id {docid} already exists")
        self.documents[docid] = doc
        if activate:
            self.activate(docid)
        return docid

    def to_dict(self, offset_type=None, **kwargs):
        # TODO: check what to do with the offset type parameter!
        # The basic strategy is that we simply create the dictionary for the active document plus
        # the entries for the documents map and the annotation mappings. That way, any reader of the
        # dict representation which just ignored unknown fields can still read this in as a normal
        # document from the active document.
        # The drawback is that the active document is represented twice, but OK
        thedict = {
            "annotation_sets": {name: aset.to_dict() for name, aset in self._annotation_sets.items() },
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": self.offset_type,
            "name": self.name,
        }
        thedict["documents"] = {docid: doc.to_dict() for docid, doc in self.documents.items()}
        thedict["docid"] = self._docid
        thedict["mappings"] = self._mappings
        return thedict

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Return a MultiDocument instance as represented by the dictionary dictrepr.

        Args:
          dictrepr: the dictionary representation, as created by to_dict
          **kwargs: not used

        Returns:
          the initialized MultiDocument instance

        Raises:
          Exception: if the offset type in dictrepr is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON
        """
        doc = MultiDocument(
            dictrepr.get("text"),
            features=dictrepr.get("features"),
            docid=dictrepr.get("docid"),
        )
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        active_annsets = {}
        for name, adict in dictrepr.get("annotation_sets").items():
            active_annsets[name] = AnnotationSet.from_dict(adict, owner_doc=doc)
        doc._annotation_sets = active_annsets
        # NOTE(review): this replaces the documents map that was filled by the constructor,
        # and the restored documents are not re-activated — confirm that this is intended.
        doc.documents = {did: Document.from_dict(d) for did, d in dictrepr.get("documents", {}).items()}
        # TODO: the "mappings" entry is not restored yet
        return doc

Classes

class Document (text: str = None, features=None, changelog: ChangeLog = None)

Represent a GATE document. This is different from the original Java GATE representation in several ways:

  • the text is not mutable and can only be set at creation time, so there is no "edit" method

  • as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is no separate "FeatureMap" to store them

  • does not support listener callbacks

  • there is no separate abstraction for "content", the only content possible is text which is a unicode string that can be accessed with the "text" property
  • Spans of text can be directly accessed using doc[from:to]
  • Features may only have string keys and values which can be json-serialised
  • Annotation offsets by default are number of Unicode code points, this is different from Java where the offsets are UTF-16 Unicode code units
  • Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index of UTF-16 code unit and back)
  • No part of the document has to be present, not even the text (this allows saving just the annotations separately from the text)
  • Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)

Args

text
the text of the document. The text can be None to indicate that no initial text should be set. Once the text has been set for a document, it is immutable and cannot be changed.

features
the initial document features to set, a sequence of key/value tuples

changelog
a ChangeLog instance to use to log changes.

Returns:

Expand source code
class Document:
    """Represent a GATE document. This is different from the original Java GATE representation in several ways:
    
    * the text is not mutable and can only be set at creation time, so there is no "edit" method
    
    * as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is
      no separate "FeatureMap" to store them
    
    * does not support listener callbacks
    * there is no separate abstraction for "content", the only content possible is text which is a unicode string
      that can be accessed with the "text" property
    * Spans of text can be directly accessed using doc[from:to]
    * Features may only have string keys and values which can be json-serialised
    * Annotation offsets by default are numbers of Unicode code points, this is different from Java where the offsets
      are UTF-16 Unicode code units
    * Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index
      of UTF-16 code unit and back)
    * No part of the document has to be present, not even the text (this allows saving just the annotations separately
      from the text)
    * Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)

    Args:
      text: the text of the document. The text can be None to indicate that no initial text should be set. Once
    the text has been set for a document, it is immutable and cannot be changed.
      features: the initial document features to set, a sequence of key/value tuples
      changelog: a ChangeLog instance to use to log changes.

    Returns:

    """

    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None):
        """Initialise the document with optional text, features and changelog.

        Args:
          text: the document text, or None if no text should be set yet
          features: the initial document features, handed on to Features
          changelog: a ChangeLog instance used to record changes, or None
        """
        assert text is None or isinstance(text, str)
        assert changelog is None or isinstance(changelog, ChangeLog)
        self._text = text
        self._name = ""
        self._annotation_sets = dict()
        self.offset_type = OFFSET_TYPE_PYTHON
        # The changelog is stored before the features are created because the
        # features report their changes through self._log_feature_change, which
        # reads self._changelog.
        self._changelog = changelog
        self._features = Features(features, logger=self._log_feature_change)

    @property
    def name(self):
        """ """
        return self._name

    @name.setter
    def name(self, val):
        """

        Args:
          val: 

        Returns:

        """
        if val is None:
            val = ""
        if not isinstance(val, str):
            raise Exception("Name must be a string")
        self._name = val
        if self._changelog is not None:
            ch = {"command": "name:set"}
            ch["name"] = val
            self._changelog.append(ch)

    def _ensure_type_python(self) -> None:
        """Raise an exception unless the document currently uses PYTHON offsets."""
        if self.offset_type == OFFSET_TYPE_PYTHON:
            return
        raise Exception("Document cannot be used if it is not type PYTHON, use to_type(OFFSET_TYPE_PYTHON) first")

    def _fixup_annotations(self, method: Callable) -> None:
        """

        Args:
          method: Callable: 

        Returns:

        """
        annset_names = self._annotation_sets.keys()
        for annset_name in annset_names:
            annset = self._annotation_sets[annset_name]
            if annset._annotations is not None:
                for ann in annset._annotations.values():
                    ann._start = method(ann._start)
                    ann._end = method(ann._end)

    def to_offset_type(self, offsettype: str) -> OffsetMapper:
        """Convert the offsets of all annotations in this document to the given
        offset type. Nothing happens if the document already has that type.

        NOTE: a ChangeLog attached to the document is NOT converted.

        Args:
          offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

        Returns:
          the offset mapper used for the conversion, or None if nothing had
          to be converted

        Raises:
          Exception: if the requested/current type combination is not a known one
        """
        current = self.offset_type
        if offsettype == current:
            return None
        if current == OFFSET_TYPE_PYTHON and offsettype == OFFSET_TYPE_JAVA:
            # currently python offsets, switch to java offsets
            mapper = OffsetMapper(self._text)
            self._fixup_annotations(mapper.convert_to_java)
            self.offset_type = OFFSET_TYPE_JAVA
        elif current == OFFSET_TYPE_JAVA and offsettype == OFFSET_TYPE_PYTHON:
            # currently java offsets, switch to python offsets
            mapper = OffsetMapper(self._text)
            self._fixup_annotations(mapper.convert_to_python)
            self.offset_type = OFFSET_TYPE_PYTHON
        else:
            raise Exception("Odd offset type")
        return mapper

    def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
        """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
        a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

        The document is modified in-place.

        Args:
          changes: one or more changes
          handle_existing_anns: what to do if a change tries to add an annotation with an
            annotation id that already exists in the target set. One of ADDANN_IGNORE
            (keep the existing annotation), ADDANN_ADD_WITH_NEW_ID (add as an additional
            annotation), ADDANN_REPLACE_ANNOTATION (remove the existing one and add the new
            one under the same id), ADDANN_UPDATE_FEATURES, ADDANN_REPLACE_FEATURES or
            ADDANN_ADD_NEW_FEATURES (only touch the features of the existing annotation).
            (Default value = ADDANN_ADD_WITH_NEW_ID)

        Raises:
          Exception: if a change has no "command" field or the command is unknown
        """
        if isinstance(changes, dict):
            changes = [changes]
        elif isinstance(changes, ChangeLog):
            changes = changes.changes
        for change in changes:
            cmd = change.get("command")
            fname = change.get("feature")
            fvalue = change.get("value")
            features = change.get("features")
            sname = change.get("set")
            annid = change.get("id")
            if cmd is None:
                raise Exception("Change without field 'command'")
            if cmd == ACTION_ADD_ANNSET:
                assert sname is not None
                self.annset(sname)
            elif cmd == ACTION_ADD_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                start = change.get("start")
                end = change.get("end")
                anntype = change.get("type")

                if ann is None:
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_IGNORE:
                    pass
                elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                    # let the set assign a fresh id, but keep the logged features
                    anns.add(start, end, anntype, features=features)
                elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                    anns.remove(annid)
                    # pass the id by keyword, as done for a new annotation above,
                    # so it cannot end up in a different positional parameter
                    anns.add(start, end, anntype, annid=annid, features=features)
                elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                    ann.features.clear()
                    if features:
                        ann.features.update(features)
                elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                    fns = ann.feature_names()
                    # a change without features simply has nothing to add
                    for f, val in (features or {}).items():
                        if f not in fns:
                            ann.features[f] = val

            elif cmd == ACTION_CLEAR_ANNS:
                assert sname is not None
                anns = self.annset(sname)
                anns.clear()
            elif cmd == ACTION_CLEAR_ANN_FEATURES:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    ann.features.clear()
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_CLEAR_DOC_FEATURES:
                self.features.clear()
            elif cmd == ACTION_SET_ANN_FEATURE:
                assert fname is not None
                assert sname is not None
                assert annid is not None
                ann = self.annset(sname).get(annid)
                if ann is not None:
                    ann.features[fname] = fvalue
                else:
                    # ignore, same as for the other per-annotation feature commands
                    pass
            elif cmd == ACTION_DEL_ANN_FEATURE:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                ann = anns.get(annid)
                if ann is not None:
                    if fname is not None:
                        ann.features.pop(fname, None)
                else:
                    pass  # ignore, could happen with a detached annotation
            elif cmd == ACTION_DEL_DOC_FEATURE:
                assert fname is not None
                self.features.pop(fname, None)
            elif cmd == ACTION_DEL_ANN:
                assert sname is not None
                assert annid is not None
                anns = self.annset(sname)
                anns.remove(annid)
            elif cmd == ACTION_SET_DOC_FEATURE:
                assert fname is not None
                self.features[fname] = fvalue
            else:
                raise Exception("Unknown ChangeLog action: ", cmd)

    @property
    def features(self):
        """Accesses the features as a FeatureViewer instance. Changes made on this object are
        reflected in the document and recorded in the change log, if there is one.
        
        :return: A FeatureViewer view of the document features.

        Args:

        Returns:

        """
        return self._features


    @property
    def changelog(self):
        """Get the ChangeLog or None if no ChangeLog has been set.
        
        :return: the changelog

        Args:

        Returns:

        """
        return self._changelog

    @changelog.setter
    def changelog(self, chlog):
        """Make the document use the given changelog to record all changes
        from this moment on.

        Args:
          chlog: the new changelog to use or None to not use any

        Returns:
          the changelog used previously or None

        """
        oldchlog = self._changelog
        self._changelog = chlog
        return oldchlog

    @property
    def text(self) -> str:
        """Get the text of the document. For a partial document, the text may be None.
        
        :return: the text of the document

        Args:

        Returns:

        """
        self._ensure_type_python()
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        """Set the text of the document. This is only possible as long as it has not been set
        yet, after that, the text is immutable.

        Args:
          value: the text for the document
          value: str: 

        Returns:

        """
        if self._text is None:
            self._text = value
        else:
            raise NotImplementedError("Text cannot be modified")

    def _log_feature_change(self, command: str, feature: str = None, value=None) -> None:
        """

        Args:
          command: str: 
          feature: str:  (Default value = None)
          value:  (Default value = None)

        Returns:

        """
        if self._changelog is None:
            return
        command = "doc-"+command
        ch = {"command": command}
        if command == "doc-feature:set":
            ch["feature"] = feature
            ch["value"] = value
        self._changelog.append(ch)

    def __len__(self) -> int:
        """
        Return the length of the text.
        Note: this will convert the type of the document to python!

        :return: the length of the document text
        """
        self._ensure_type_python()
        if self._text is None:
            return 0
        else:
            return len(self._text)

    def __getitem__(self, span) -> str:
        """
        Return the document text covered by the given span.

        :param span: an index, a slice (from:to), an Annotation or an AnnotationSet.
        For an annotation or annotation set, its start and end offsets are used.
        :return: the text of the span
        """
        self._ensure_type_python()
        if isinstance(span, Annotation):
            span = slice(span._start, span._end)
        elif isinstance(span, AnnotationSet):
            span = slice(span.start(), span.end())
        return self.text[span]

    def annset(self, name: str = "") -> AnnotationSet:
        """Get the named annotation set, if name is not given or the empty string, the default annotation set.
        If the annotation set does not already exist, it is created.

        Args:
          name: the annotation set name, the empty string is used for the "default annotation set".
          name: str:  (Default value = "")

        Returns:
          the specified annotation set.

        """
        self._ensure_type_python()
        if name not in self._annotation_sets:
            annset = AnnotationSet(owner_doc=self, name=name)
            self._annotation_sets[name] = annset
            if self._changelog:
                self._changelog.append({
                    "command": "annotations:add",
                    "set": name})
            return annset
        else:
            return self._annotation_sets[name]

    def annset_names(self) -> KeysView[str]:
        """

        Args:

        Returns:
          :return: annotation set names

        """
        self._ensure_type_python()
        return list(self._annotation_sets.keys())
    
    def remove_annset(self, name: str):
        """Completely remove the annotation set.

        Args:
          name: name of the annotation set to remove
          name: str: 

        Returns:

        """
        if name not in self._annotation_sets:
            raise Exception(f"AnnotationSet with name {name} does not exist")
        del self._annotation_sets[name]
        if self._changelog:
            self._changelog.append({
                "command": "annotations:remove",
                "set": name})

    def __repr__(self) -> str:
        """
        String representation of the document, showing all content.

        :return: string representation
        """
        return "Document({},features={},anns={})".format(self.text, self._features, self._annotation_sets.__repr__())

    def __str__(self) -> str:
        asets = "["+",".join([f"'{k}':{len(v)}" for k, v in self._annotation_sets.items()])+"]"
        return "Document({},features={},anns={})".format(self.text, self._features, asets)

    def to_dict(self, offset_type=None, **kwargs):
        """Create a dictionary representation of this document which can be turned
        back into a document with from_dict.
        NOTE: an active changelog is considered transient and is not part of the
        output.

        Args:
          offset_type: offset type to convert to on the fly, None keeps the
            current offset type (Default value = None)
          **kwargs: passed on to the to_dict method of each annotation set

        Returns:
          the dictionary representation of this instance

        """
        if offset_type is None:
            offset_type = self.offset_type
        else:
            assert offset_type in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON)
            # Only when a different type is requested and there is text to map,
            # an offset mapper is handed down to where the annotations get converted.
            # NOTE(review): without text no mapper is created, yet the requested
            # type is still reported below - confirm this is intended.
            if offset_type != self.offset_type and self._text is not None:
                kwargs["offset_mapper"] = OffsetMapper(self._text)
                kwargs["offset_type"] = offset_type

        annsets_repr = {name: aset.to_dict(**kwargs) for name, aset in self._annotation_sets.items()}
        return {
            "annotation_sets": annsets_repr,
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": offset_type,
            "name": self.name,
        }

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Return a Document instance as represented by the dictionary dictrepr.

        Args:
          dictrepr: the dictionary representation, with the keys "text", "features",
            "name", "offset_type" and "annotation_sets". A missing or None
            "annotation_sets" entry is treated as "no annotation sets".
          **kwargs: currently not used

        Returns:
          the initialized Document instance

        Raises:
          Exception: if the offset type is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON

        """
        doc = Document(dictrepr.get("text"), features=dictrepr.get("features"))
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        # a representation without annotations is valid, do not crash on it
        annsets_repr = dictrepr.get("annotation_sets") or {}
        doc._annotation_sets = {name: AnnotationSet.from_dict(adict, owner_doc=doc)
                                for name, adict in annsets_repr.items()}
        return doc

    def save(self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Save the document to the destination file.

        Args:
          destination: either a file name or something that has a write(string) method.
          fmt: serialization format, by default the format is inferred from the file extension.
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the document saver.
          **kwargs: 

        Returns:

        """
        if fmt is None or isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(destination, fmt)
            saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
        else:
            # assume fmt is a callable to get used directly
            fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)

    def save_mem(self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs):
        """Serialize to a string or bytes in the given format.

        Args:
          fmt: serialization format to use. (Default value = "json")
          offset_type: store using the given offset type or keep the current if None (Default value = None)
          mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
          kwargs: additional parameters for the format.
          **kwargs: 

        Returns:

        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            m = importlib.import_module(mod)
            saver = m.get_document_saver(None, fmt)
            return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
        else:
            fmt(Document, self, to_mem=True, offset_type=offset_type, **kwargs)

    @staticmethod
    def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
        """Load or import a document from the given source, which can be a file path,
        a file name or a URL. A source of type str that starts with "http[s]://" is
        treated as a URL. To deliberately use a file instead of a URL, pass a pathlib
        Path; to deliberately use a URL instead of a file, parse the URL using urllib.

        Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

        Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

        NOTE: the offset type of the document is always converted to PYTHON when loading!

        Args:
          source: the URL or file path to load from.
          fmt: the format of the source. By default the format is inferred from the
            file extension. The format can be a format mnemonic like "json", "html",
            or a known mime type like "text/bdocjs". Anything that is neither None
            nor a string is used directly as the loader callable.
          mod: the name of a module where the document loader is implemented.
            (Default value = "gatenlp.serialization.default")
          **kwargs: additional format specific keyword arguments to pass to the loader

        Returns:
          the loaded document

        """
        if fmt is None or isinstance(fmt, str):
            loader = importlib.import_module(mod).get_document_loader(source, fmt)
        else:
            loader = fmt
        doc = loader(Document, from_ext=source, **kwargs)
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    @staticmethod
    def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
        """Create a document from the in-memory serialization in source, which is a
        string or bytes, depending on the format.

        Note: the offset type is always converted to PYTHON when loading!

        Args:
          source: the string/bytes to deserialize
          fmt: the format name, or a callable used directly as the loader (Default value = "json")
          mod: the name of the module where the loader is implemented
            (Default value = "gatenlp.serialization.default")
          **kwargs: additional arguments to pass to the loader

        Returns:
          the loaded document

        Raises:
          Exception: if no format is given

        """
        if not fmt:
            raise Exception("Format required.")
        if isinstance(fmt, str):
            loader = importlib.import_module(mod).get_document_loader(None, fmt)
        else:
            loader = fmt
        doc = loader(Document, from_mem=source, **kwargs)
        if doc.offset_type == OFFSET_TYPE_JAVA:
            doc.to_offset_type(OFFSET_TYPE_PYTHON)
        return doc

    def __copy__(self):
        """
        Creates a shallow copy except the changelog which is set to None.
        The copy shares the annotation sets with this document, gets a copy of
        the features and keeps the document name.

        :return: shallow copy of the document
        """
        doc = Document(self._text)
        doc._annotation_sets = self._annotation_sets
        doc.offset_type = self.offset_type
        doc._features = self._features.copy()
        # keep the name; the private field is assigned directly since the copy
        # has no changelog that would need to record this
        doc._name = self._name
        return doc

    def copy(self):
        """Creates a shallow copy except the changelog which is set to None.
        
        :return: shallow copy of the document

        Args:

        Returns:

        """
        return self.__copy__()

    def __deepcopy__(self, memo):
        """
        Creates a deep copy, except the changelog which is set to None.
        The copy keeps the document name and the offset type.

        :param memo: the memoization dictionary to use.

        :return: a deep copy of the document.
        """
        if self._features is not None:
            fts = lib_copy.deepcopy(self._features.to_dict(), memo)
        else:
            fts = None
        doc = Document(self._text, features=fts)
        doc._changelog = None
        doc._name = self._name
        doc.offset_type = self.offset_type
        # Register the copy before copying the annotation sets, so that any
        # reference back to this document met while copying them resolves to the
        # new document instead of triggering another copy of this one.
        if memo is not None:
            memo[id(self)] = doc
        doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        return doc

    def deepcopy(self, memo=None):
        """Creates a deep copy, except the changelog which is set to None.

        Args:
          memo: the memoization dictionary to use.

        Returns:
          a deep copy of the document.

        """
        return lib_copy.deepcopy(self, memo=memo)

    def _repr_html_(self):
        """
        Render function for Jupyter notebooks. Returns the html-ann-viewer HTML.
        This renders the HTML for notebook, for offline mode, but does not add the JS
        but instead initializes the JS in the notebook unless gatenlp.init_notebook()
        has bee called already.
        """
        return self._notebook_show()

    def notebook_show(self, htmlid=None):
        """
        Show the document in a Jupyter notebook. This allows to assign a specific htmlid so
        the generated HTML can be directly styled afterwards.
        This directly sends the rendered document to the cell (no display/HTML necessary).

        Args:
            htmlid: the HTML id prefix to use for classes and element ids.

        """
        self._notebook_show(htmlid=htmlid, display=True)

    def _notebook_show(self, htmlid=None, display=False):
        """Render the document with the "html-ann-viewer" format for a notebook.

        The viewer JavaScript is initialised once per session, tracked by
        gatenlpconfig.notebook_js_initialized.

        Args:
          htmlid: the HTML id prefix passed on to the serializer (Default value = None)
          display: if True, send the HTML to the notebook cell and return nothing,
            otherwise return the HTML (Default value = False)

        Returns:
          the generated HTML if display is False, otherwise None
        """
        from gatenlp.gatenlpconfig import gatenlpconfig
        from gatenlp.serialization.default import HtmlAnnViewerSerializer
        from IPython.display import display_html
        if not gatenlpconfig.notebook_js_initialized:
            HtmlAnnViewerSerializer.init_javscript()
            gatenlpconfig.notebook_js_initialized = True
        html = self.save_mem(
            fmt="html-ann-viewer",
            notebook=True,
            add_js=False,
            offline=True,
            htmlid=htmlid,
        )
        if not display:
            return html
        display_html(html, raw=True)

Subclasses

Static methods

def from_dict(dictrepr, **kwargs)

Return a Document instance as represented by the dictionary dictrepr.

Args

dictrepr
the dictionary representation from which the document is created

**kwargs:

Returns

the initialized Document instance

Expand source code
@staticmethod
def from_dict(dictrepr, **kwargs):
    """Return a Document instance as represented by the dictionary dictrepr.

    Args:
      dictrepr: the dictionary representation, with the keys "text", "features",
        "name", "offset_type" and "annotation_sets". A missing or None
        "annotation_sets" entry is treated as "no annotation sets".
      **kwargs: currently not used

    Returns:
      the initialized Document instance

    Raises:
      Exception: if the offset type is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON

    """
    doc = Document(dictrepr.get("text"), features=dictrepr.get("features"))
    doc.name = dictrepr.get("name")
    doc.offset_type = dictrepr.get("offset_type")
    if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
        raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
    # a representation without annotations is valid, do not crash on it
    annsets_repr = dictrepr.get("annotation_sets") or {}
    doc._annotation_sets = {name: AnnotationSet.from_dict(adict, owner_doc=doc)
                            for name, adict in annsets_repr.items()}
    return doc
def load(source, fmt=None, mod='gatenlp.serialization.default', **kwargs)

Load or import a document from the given source. The source can be a file path, a file name or a URL. If the source is of type str and starts with "http[s]://", it is treated as a URL. To deliberately use a file instead of a URL, create a pathlib Path; to deliberately use a URL instead of a file, parse the URL using urllib.

Example: Document.load(urllib.parse.urlparse(someurl), fmt=theformat)

Example: Document.load(pathlib.Path(somepath), fmt=theformat)

NOTE: the offset type of the document is always converted to PYTHON when loading!

Args

source
the URL or file path to load from.
fmt
the format of the source. By default the format is inferred from the file extension. The format can be a format mnemonic like "json", "html", or a known mime type like "text/bdocjs".
mod
the name of a module where the document loader is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional format specific keyword arguments to pass to the loader

Returns

the loaded document

Expand source code
@staticmethod
def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
    """Load or import a document from the given source, which can be a file path,
    a file name or a URL. A source of type str that starts with "http[s]://" is
    treated as a URL. To deliberately use a file instead of a URL, pass a pathlib
    Path; to deliberately use a URL instead of a file, parse the URL using urllib.

    Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

    Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

    NOTE: the offset type of the document is always converted to PYTHON when loading!

    Args:
      source: the URL or file path to load from.
      fmt: the format of the source. By default the format is inferred from the
        file extension. The format can be a format mnemonic like "json", "html",
        or a known mime type like "text/bdocjs". Anything that is neither None
        nor a string is used directly as the loader callable.
      mod: the name of a module where the document loader is implemented.
        (Default value = "gatenlp.serialization.default")
      **kwargs: additional format specific keyword arguments to pass to the loader

    Returns:
      the loaded document

    """
    if fmt is None or isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(source, fmt)
    else:
        loader = fmt
    doc = loader(Document, from_ext=source, **kwargs)
    if doc.offset_type == OFFSET_TYPE_JAVA:
        doc.to_offset_type(OFFSET_TYPE_PYTHON)
    return doc
def load_mem(source, fmt='json', mod='gatenlp.serialization.default', **kwargs)

Create a document from the in-memory serialization in source. Source can be a string or bytes, depending on the format.

Note: the offset type is always converted to PYTHON when loading!

Args

source
the string/bytes to deserialize
fmt
the format (Default value = "json")
mod
the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
kwargs
additional arguments to pass to the loader
**kwargs
 

Returns:

Expand source code
@staticmethod
def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
    """Create a document from the in-memory serialization in source, which is a
    string or bytes, depending on the format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
      source: the string/bytes to deserialize
      fmt: the format name, or a callable used directly as the loader (Default value = "json")
      mod: the name of the module where the loader is implemented
        (Default value = "gatenlp.serialization.default")
      **kwargs: additional arguments to pass to the loader

    Returns:
      the loaded document

    Raises:
      Exception: if no format is given

    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(None, fmt)
    else:
        loader = fmt
    doc = loader(Document, from_mem=source, **kwargs)
    if doc.offset_type == OFFSET_TYPE_JAVA:
        doc.to_offset_type(OFFSET_TYPE_PYTHON)
    return doc

Instance variables

property/get/set changelog

Get the ChangeLog or None if no ChangeLog has been set.

:return: the changelog

Args:

Returns:

Expand source code
@property
def changelog(self):
    """Return the ChangeLog currently in use.

    Returns:
      the changelog, or None if no ChangeLog has been set
    """
    return self._changelog
property/get features

Accesses the features as a FeatureViewer instance. Changes made on this object are reflected in the document and recorded in the change log, if there is one.

:return: A FeatureViewer view of the document features.

Args:

Returns:

Expand source code
@property
def features(self):
    """Return the object holding the document features.

    This is the object stored on the document itself, not a copy.

    Returns:
      the features of the document
    """
    return self._features
property/get/set name
Expand source code
@property
def name(self):
    """Return the name of the document."""
    return self._name
property/get/set text : str

Get the text of the document. For a partial document, the text may be None.

:return: the text of the document

Args:

Returns:

Expand source code
@property
def text(self) -> str:
    """Return the text of the document.

    Returns:
      the document text; may be None for a partial document

    Raises:
      Exception: if the document is not of offset type PYTHON
    """
    self._ensure_type_python()
    return self._text

Methods

def annset(self, name: str = '') ‑> AnnotationSet

Get the named annotation set, if name is not given or the empty string, the default annotation set. If the annotation set does not already exist, it is created.

Args

name
the annotation set name, the empty string is used for the "default annotation set".
name
str: (Default value = "")

Returns

the specified annotation set.

Expand source code
def annset(self, name: str = "") -> AnnotationSet:
    """Return the annotation set with the given name, creating it if necessary.

    If a set is created and the document has a changelog, an
    "annotations:add" entry for the new set is appended to the changelog.

    Args:
      name: the annotation set name; the empty string denotes the
        "default annotation set". (Default value = "")

    Returns:
      the specified annotation set.

    """
    self._ensure_type_python()
    if name in self._annotation_sets:
        return self._annotation_sets[name]
    annset = AnnotationSet(owner_doc=self, name=name)
    self._annotation_sets[name] = annset
    # Compare against None instead of relying on truthiness: "no changelog" is
    # represented by None, and a changelog object that evaluates as falsy
    # (e.g. a still-empty one) must nevertheless record the new set.
    if self._changelog is not None:
        self._changelog.append({
            "command": "annotations:add",
            "set": name})
    return annset
def annset_names(self) ‑> KeysView[str]

Args:

Returns

:return: annotation set names

Expand source code
def annset_names(self) -> list:
    """Return the names of all annotation sets of this document.

    The result is a new list (a snapshot), not a live view of the underlying
    mapping, so it can be indexed and stays valid if sets are added or removed
    afterwards.

    Returns:
      list of annotation set names (str).

    """
    self._ensure_type_python()
    return list(self._annotation_sets)
def apply_changes(self, changes, handle_existing_anns='add-with-new-id')

Apply changes from a ChangeLog to this document. changes can be a ChangeLog instance, a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

The document is modified in-place.

Args

changes
one or more changes
handle_existing_anns
what to do if the change from the changelog tries to add an annotation with an annotation id that already exists in the target set. (Default value = ADDANN_ADD_WITH_NEW_ID)

Returns:

Expand source code
def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
    """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
    a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object.

    The document is modified in-place.

    Args:
      changes: one or more changes
      handle_existing_anns: what to do if a change tries to add an annotation with an
        annotation id that already exists in the target set. One of ADDANN_IGNORE
        (skip the change), ADDANN_ADD_WITH_NEW_ID (add the annotation without
        requesting the id), ADDANN_REPLACE_ANNOTATION (remove the existing annotation
        and add the new one under the same id), ADDANN_UPDATE_FEATURES (update the
        existing annotation's features), ADDANN_REPLACE_FEATURES (clear, then update
        the features) or ADDANN_ADD_NEW_FEATURES (only add features whose names are
        not present yet). Any other value leaves the existing annotation untouched.
        (Default value = ADDANN_ADD_WITH_NEW_ID)

    Raises:
      Exception: if a change has no "command" field or the command is unknown.
      AssertionError: if a field required by the command is missing from the change.

    """
    if isinstance(changes, dict):
        changes = [changes]
    elif isinstance(changes, ChangeLog):
        changes = changes.changes
    for change in changes:
        cmd = change.get("command")
        fname = change.get("feature")
        fvalue = change.get("value")
        features = change.get("features")
        sname = change.get("set")
        annid = change.get("id")
        if cmd is None:
            raise Exception("Change without field 'command'")
        if cmd == ACTION_ADD_ANNSET:
            assert sname is not None
            self.annset(sname)
        elif cmd == ACTION_ADD_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            start = change.get("start")
            end = change.get("end")
            anntype = change.get("type")

            if ann is None:
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_IGNORE:
                pass
            elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                # The id is not requested so the set can assign a fresh one, but the
                # features recorded in the change must not get lost.
                anns.add(start, end, anntype, features=features)
            elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                anns.remove(annid)
                # Keyword arguments, as in the "ann is None" case above: passing
                # annid positionally depends on the parameter order of add() and
                # silently dropped the features of the replacement.
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                if features:  # the change may not carry any features
                    ann.features.update(features)
            elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                ann.features.clear()
                if features:
                    ann.features.update(features)
            elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                if features:
                    fns = ann.feature_names()
                    for f in features.keys():
                        if f not in fns:
                            ann.features[f] = features[f]

        elif cmd == ACTION_CLEAR_ANNS:
            assert sname is not None
            anns = self.annset(sname)
            anns.clear()
        elif cmd == ACTION_CLEAR_ANN_FEATURES:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            if ann is not None:
                ann.features.clear()
            else:
                pass  # ignore, could happen with a detached annotation
        elif cmd == ACTION_CLEAR_DOC_FEATURES:
            self.features.clear()
        elif cmd == ACTION_SET_ANN_FEATURE:
            assert fname is not None
            assert sname is not None
            assert annid is not None
            # NOTE(review): unlike the clear/delete cases, a missing annotation is
            # not ignored here; if get() yields None this fails with AttributeError.
            ann = self.annset(sname).get(annid)
            ann.features[fname] = fvalue
        elif cmd == ACTION_DEL_ANN_FEATURE:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            if ann is not None:
                if fname is not None:
                    ann.features.pop(fname, None)
            else:
                pass  # ignore, could happen with a detached annotation
        elif cmd == ACTION_DEL_DOC_FEATURE:
            assert fname is not None
            self.features.pop(fname, None)
        elif cmd == ACTION_DEL_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            anns.remove(annid)
        elif cmd == ACTION_SET_DOC_FEATURE:
            assert fname is not None
            self.features[fname] = fvalue
        else:
            # The former trailing branches for ACTION_CLEAR_DOC_FEATURES and
            # ACTION_DEL_DOC_FEATURE were unreachable duplicates of the cases
            # handled above and have been removed.
            raise Exception("Unknown ChangeLog action: ", cmd)
def copy(self)

Creates a shallow copy except the changelog which is set to None.

:return: shallow copy of the document

Args:

Returns:

Expand source code
def copy(self):
    """Return a shallow copy of the document.

    The work is delegated to this object's `__copy__` method; according to the
    original documentation the changelog of the copy is set to None.

    Returns:
      shallow copy of the document

    """
    duplicate = self.__copy__()
    return duplicate
def deepcopy(self, memo=None)

Creates a deep copy, except the changelog which is set to None.

Args

memo
the memoization dictionary to use.

Returns

a deep copy of the document.

Expand source code
def deepcopy(self, memo=None):
    """Return a deep copy of the document, produced by the standard library's
    `copy.deepcopy`.

    According to the original documentation the changelog of the copy is set
    to None (that is decided by the object's deep-copy hook, not here).

    Args:
      memo: the memoization dictionary to use. (Default value = None)

    Returns:
      a deep copy of the document.

    """
    duplicate = lib_copy.deepcopy(self, memo)
    return duplicate
def notebook_show(self, htmlid=None)

Show the document in a Jupyter notebook. This allows assigning a specific htmlid so that the generated HTML can be styled directly afterwards. The rendered document is sent directly to the cell (no display/HTML necessary).

Args

htmlid
the HTML id prefix to use for classes and element ids.
Expand source code
def notebook_show(self, htmlid=None):
    """
    Render the document directly into the current Jupyter notebook cell.

    A specific htmlid can be assigned so that the generated HTML can be styled
    afterwards. Rendering is delegated to `_notebook_show` with `display=True`,
    so no separate display/HTML call is necessary.

    Args:
        htmlid: the HTML id prefix to use for classes and element ids.

    """
    render_options = {"htmlid": htmlid, "display": True}
    self._notebook_show(**render_options)
def remove_annset(self, name: str)

Completely remove the annotation set.

Args

name
name of the annotation set to remove
name
str:

Returns:

Expand source code
def remove_annset(self, name: str):
    """Completely remove the annotation set.

    If the document has a changelog, an "annotations:remove" entry for the
    set is appended to it.

    Args:
      name: name of the annotation set to remove

    Raises:
      Exception: if no annotation set with that name exists.

    """
    if name not in self._annotation_sets:
        raise Exception(f"AnnotationSet with name {name} does not exist")
    del self._annotation_sets[name]
    # Compare against None instead of relying on truthiness: "no changelog" is
    # represented by None, and a changelog object that evaluates as falsy
    # (e.g. a still-empty one) must nevertheless record the removal.
    if self._changelog is not None:
        self._changelog.append({
            "command": "annotations:remove",
            "set": name})
def save(self, destination, fmt=None, offset_type=None, mod='gatenlp.serialization.default', **kwargs)

Save the document to the destination file.

Args

destination
either a file name or something that has a write(string) method.
fmt
serialization format, by default the format is inferred from the file extension.
offset_type
store using the given offset type or keep the current if None (Default value = None)
mod
module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional parameters for the document saver.
**kwargs
 

Returns:

Expand source code
def save(self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", **kwargs):
    """Write the document to the given destination.

    Args:
      destination: either a file name or something that has a write(string) method.
      fmt: serialization format; if None the saver lookup receives None together with
        the destination (by default the format is inferred from the file extension).
        Anything that is neither None nor a string is called directly as the saver.
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the document saver.

    """
    if fmt is not None and not isinstance(fmt, str):
        # Not a format name: treat it as a ready-made saver callable.
        fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
        return
    saver_module = importlib.import_module(mod)
    write_document = saver_module.get_document_saver(destination, fmt)
    write_document(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
def save_mem(self, fmt='json', offset_type=None, mod='gatenlp.serialization.default', **kwargs)

Serialize to a string or bytes in the given format.

Args

fmt
serialization format to use. (Default value = "json")
offset_type
store using the given offset type or keep the current if None (Default value = None)
mod
module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
additional parameters for the format.
**kwargs
 

Returns:

Expand source code
def save_mem(self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs):
    """Serialize to a string or bytes in the given format.

    Args:
      fmt: serialization format to use, either a format name looked up in `mod`
        or a callable that is used directly as the saver. (Default value = "json")
      offset_type: store using the given offset type or keep the current if None (Default value = None)
      mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
      **kwargs: additional parameters for the format.

    Returns:
      whatever the saver returns for `to_mem=True` (the serialized document).

    Raises:
      Exception: if `fmt` is empty or None.

    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        m = importlib.import_module(mod)
        saver = m.get_document_saver(None, fmt)
    else:
        # assume fmt is a callable to get used directly
        saver = fmt
    # Both branches must hand the serialized data back to the caller; previously
    # the result of a callable fmt was discarded and None was returned.
    return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
def to_dict(self, offset_type=None, **kwargs)

Convert this instance to a dictionary that can be used to re-create the instance with from_dict. NOTE: if there is an active changelog, it is not included in the output as this field is considered a transient field!

Args

offset_type
convert to the given offset type on the fly (Default value = None)
**kwargs
 

Returns

the dictionary representation of this instance

Expand source code
def to_dict(self, offset_type=None, **kwargs):
    """Build a dictionary representation of this document that can be used to
    re-create the instance with from_dict.
    NOTE: the changelog is never part of the output, it is considered a
    transient field!

    Args:
      offset_type: offset type to report in the result; if it differs from the
        document's current offset type and the document has text, an offset mapper
        is handed to the annotation sets so offsets get converted on the fly
        (Default value = None, meaning: keep the current offset type)
      **kwargs: passed on to the `to_dict` method of every annotation set

    Returns:
      the dictionary representation of this instance

    """
    if offset_type is None:
        # Nothing requested: report whatever the document currently uses.
        offset_type = self.offset_type
    else:
        assert offset_type in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON)
        # Conversion is only needed (and only possible) when the requested type
        # differs from the current one and there is text to build a mapper from.
        if offset_type != self.offset_type and self._text is not None:
            kwargs["offset_mapper"] = OffsetMapper(self._text)
            kwargs["offset_type"] = offset_type

    serialized_sets = {}
    for set_name, ann_set in self._annotation_sets.items():
        serialized_sets[set_name] = ann_set.to_dict(**kwargs)

    return {
        "annotation_sets": serialized_sets,
        "text": self._text,
        "features": self._features.to_dict(),
        "offset_type": offset_type,
        "name": self.name,
    }
def to_offset_type(self, offsettype: str) ‑> OffsetMapper

Convert all the offsets of all the annotations in this document to the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets are already of that type, this does nothing.

NOTE: if the document has a ChangeLog, it is NOT also converted!

The method returns the offset mapper if anything actually was converted, otherwise None.

Args

offsettype
either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON
offsettype
str:

Returns

offset mapper or None

Expand source code
def to_offset_type(self, offsettype: str) -> OffsetMapper:
    """Switch the offsets of all annotations in this document to the requested
    offset type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. Nothing happens
    if the document already uses that type.

    NOTE: if the document has a ChangeLog, it is NOT also converted!

    Args:
      offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

    Returns:
      the offset mapper used for the conversion, or None if nothing was converted

    Raises:
      Exception: if the requested/current offset type combination is neither
        python-to-java nor java-to-python.

    """
    current = self.offset_type
    if current == offsettype:
        return None

    python_to_java = offsettype == OFFSET_TYPE_JAVA and current == OFFSET_TYPE_PYTHON
    java_to_python = offsettype == OFFSET_TYPE_PYTHON and current == OFFSET_TYPE_JAVA
    if not (python_to_java or java_to_python):
        raise Exception("Odd offset type")

    mapper = OffsetMapper(self._text)
    if python_to_java:
        self._fixup_annotations(mapper.convert_to_java)
        self.offset_type = OFFSET_TYPE_JAVA
    else:
        self._fixup_annotations(mapper.convert_to_python)
        self.offset_type = OFFSET_TYPE_PYTHON
    return mapper
class MultiDocument (text: str = None, features=None, changelog: ChangeLog = None, docid=0)

NOTE: This is not implemented fully yet!

A MultiDocument can store more than one document, each identified by their ids. One of those documents is always the "active" one and the MultiDocument can be used just like a Document with that content. In addition, there are methods to make each of the other documents active and to create mappings between annotations of pairs of documents.

An AnnotationMapping is something that maps annotations to annotations, either for the same document, from the same or different sets, or for different documents. Once an annotation becomes part of a mapping, that annotation becomes immutable. Even if the original annotation in the document changes or gets removed, the mapping retains the original copy of the annotation until the mapping is modified or removed.

Expand source code
class MultiDocument(Document):
    """
    NOTE: This is not implemented fully yet!

    A MultiDocument can store more than one document, each identified by their ids. One of those
    documents is always the "active" one and the MultiDocument can be used just like a Document
    with that content. In addition, there are methods to make each of the other documents active
    and to create mappings between annotations of pairs of documents.

    An AnnotationMapping is something that maps annotations to annotations, either for the same
    document, from the same or different sets, or for different documents. Once an annotation
    becomes part of a mapping, that annotation becomes immutable. Even if the original annotation
    in the document changes or gets removed, the mapping retains the original copy of the annotation
    until the mapping is modified or removed.
    """
    # TODO: ALL necessary fields of the document must be references of mutable objects so that
    # if something is changed for the active document the one stored in the documents map is
    # really updated as well, or we must override the updating method to change both!
    # A better way could be to override all methods to always directly change the document in the
    # documents map, and simply pass on all calls to the activated document.
    # In that case, to_dict and from_dict would actually generate the fields for normal document
    # readers and ignore them on restore
    def __init__(self, text: str = None, features=None, changelog: ChangeLog = None, docid=0):
        """Create a MultiDocument with one initial document, which becomes the active one.

        Args:
          text: text of the initial document (Default value = None)
          features: features of the initial document (Default value = None)
          changelog: changelog of the initial document (Default value = None)
          docid: id under which the initial document is stored (Default value = 0)
        """
        self.documents = {}  # map from document id to document
        self._mappings = None   # TODO: we need to implement this
        self._docid = None
        doc = Document(text, features=features, changelog=changelog)
        self.documents[docid] = doc
        self.activate(docid)

    @property
    def docid(self):
        """The id of the currently active document."""
        return self._docid

    def activate(self, docid=0):
        """Make the document stored under `docid` the active one.

        The changelog, features, annotation sets, text and name of that document
        are taken over by reference; the offset type is set to OFFSET_TYPE_PYTHON.

        Args:
          docid: id of the document to activate (Default value = 0)

        Raises:
          Exception: if there is no document with that id.
        """
        if docid not in self.documents:
            raise Exception(f"Cannot activate id {docid}, not in MultiDocument")
        doc = self.documents[docid]
        self._changelog = doc._changelog
        self._features = doc._features
        self._annotation_sets = doc._annotation_sets
        self._text = doc._text
        self.offset_type = OFFSET_TYPE_PYTHON
        self._name = doc._name
        self._docid = docid

    def add_document(self, doc, docid=None, activate=False):
        """Store another document in this MultiDocument.

        Args:
          doc: the document to add
          docid: id to store the document under; if None, an unused integer id is
            chosen automatically (Default value = None)
          activate: if True, make the added document the active one (Default value = False)

        Returns:
          the id under which the document was stored.

        Raises:
          Exception: if an explicitly given id is already in use.
        """
        if docid is None:
            # Start at the number of stored documents, but skip ids already in use
            # (possible when documents were stored under non-consecutive ids),
            # so that an existing document is never silently overwritten.
            docid = len(self.documents)
            while docid in self.documents:
                docid += 1
        elif docid in self.documents:
            raise Exception(f"Cannot add document to MultiDocument, id {docid} already exists")
        self.documents[docid] = doc
        if activate:
            self.activate(docid)
        return docid

    def to_dict(self, offset_type=None, **kwargs):
        """Convert to a dictionary: the fields of the active document plus the
        entries "documents", "docid" and "mappings".

        Args:
          offset_type: currently ignored (see TODO below) (Default value = None)
          **kwargs: currently ignored

        Returns:
          the dictionary representation of this instance
        """
        # TODO: check what to do with the offset type parameter!
        # The basic strategy is that we simply create the dictionary for the active document plus
        # the entries for the documents map and the annotation mappings. That way, any reader of the
        # dict representation which just ignored unknown fields can still read this in as a normal
        # document from the active document.
        # The drawback is that the active document is represented twice, but OK
        thedict = {
            "annotation_sets": {name: aset.to_dict() for name, aset in self._annotation_sets.items() },
            "text": self._text,
            "features": self._features.to_dict(),
            "offset_type": self.offset_type,
            "name": self.name,
        }
        thedict["documents"] = {docid: doc.to_dict() for docid, doc in self.documents.items()}
        thedict["docid"] = self._docid
        thedict["mappings"] = self._mappings
        return thedict

    @staticmethod
    def from_dict(dictrepr, **kwargs):
        """Re-create a MultiDocument from the dictionary created by to_dict.

        Args:
          dictrepr: the dictionary representation
          **kwargs: currently ignored

        Returns:
          the MultiDocument

        Raises:
          Exception: if the stored offset type is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON.
        """
        feats = dictrepr.get("features")
        docid = dictrepr.get("docid")
        doc = MultiDocument(dictrepr.get("text"), features=feats, docid=docid)
        doc.name = dictrepr.get("name")
        doc.offset_type = dictrepr.get("offset_type")
        if doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON:
            raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
        annsets = {name: AnnotationSet.from_dict(adict, owner_doc=doc)
                   for name, adict in dictrepr.get("annotation_sets").items()}
        doc._annotation_sets = annsets
        # This replaces the documents map built by the constructor with the stored one.
        doc.documents = {did: Document.from_dict(d) for did, d in dictrepr.get("documents", {}).items()}
        #mappingsrepr = dictrepr.get("mappings")
        #if mappingsrepr:
        #    doc._mappings = AnnotationMappingsOrWhatever.from_dict()
        return doc

Ancestors

Instance variables

property/get docid
Expand source code
@property
def docid(self):
    """Return the id of the currently active document (the value of `_docid`)."""
    active_id = self._docid
    return active_id

Methods

def activate(self, docid=0)
Expand source code
def activate(self, docid=0):
    """Make the document stored under `docid` the active one.

    The changelog, features, annotation sets, text and name of that document
    are taken over by reference; the offset type is set to OFFSET_TYPE_PYTHON.

    Args:
      docid: id of the document to activate (Default value = 0)

    Raises:
      Exception: if there is no document with that id.

    """
    stored = self.documents
    if docid not in stored:
        raise Exception(f"Cannot activate id {docid}, not in MultiDocument")
    target = stored[docid]
    # Take over the state of the target document field by field.
    for field in ("_changelog", "_features", "_annotation_sets", "_text"):
        setattr(self, field, getattr(target, field))
    self.offset_type = OFFSET_TYPE_PYTHON
    self._name = target._name
    self._docid = docid
def add_document(self, doc, docid=None, activate=False)
Expand source code
def add_document(self, doc, docid=None, activate=False):
    """Store another document in this MultiDocument.

    Args:
      doc: the document to add
      docid: id to store the document under; if None, an unused integer id is
        chosen automatically (Default value = None)
      activate: if True, make the added document the active one (Default value = False)

    Returns:
      the id under which the document was stored.

    Raises:
      Exception: if an explicitly given id is already in use.

    """
    if docid is None:
        # Start at the number of stored documents, but skip ids already in use
        # (possible when documents were stored under non-consecutive ids),
        # so that an existing document is never silently overwritten.
        docid = len(self.documents)
        while docid in self.documents:
            docid += 1
    elif docid in self.documents:
        raise Exception(f"Cannot add document to MultiDocument, id {docid} already exists")
    self.documents[docid] = doc
    if activate:
        self.activate(docid)
    return docid

Inherited members