File size: 7,356 Bytes
1d5604f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""Encapsulates all word and punctuation symbols layer.

Layer 0 is the basic layer for all the UCCA annotation, as it includes the
actual words and punctuation marks found in the :class:`core`.Passage.

Layer 0 has only one type of node, :class:`Terminal`. This is a subtype of
:class:`core`.Node, and can have one of two tags: Word or Punctuation.

"""

from ucca import core

# ID of this layer: passed as the ID of Layer0 and used as the prefix of
# every Terminal ID built by Layer0.add_terminal.
LAYER_ID = '0'


class NodeTags:
    """Namespace of the tags a layer 0 node can carry.

    The class only groups string constants and is not meant to be
    instantiated.
    """

    Word = 'Word'
    Punct = 'Punctuation'

    # With __init__ set to None, calling NodeTags() fails with a TypeError.
    __init__ = None


# Keys of the attribute dictionary that Layer0.add_terminal gives to each
# Terminal it creates.
ATTRIB_KEYS = ('text', 'paragraph', 'paragraph_position')


class Terminal(core.Node):
    """Layer 0 Node type, represents a word or a punctuation mark.

    Terminals are :class:`core`.Node objects which represent a word or
    a punctuation mark in the :class:`core`.Passage object. They are immutable,
    as they shouldn't be changed throughout their use and have no children.
    Hence, they can be compared and hashed, unlike other core.Node subclasses.

    Attributes:
        ID: the unique ID of each Terminal is its global position in the
            Passage, e.g. ID=0.4 is the 4th Terminal in the :class:`Passage`.
        tag: from NodeTags
        layer: '0' (LAYER_ID)
        attrib: returns a copy of the attribute dictionary, so changing it
            will not affect the Terminal object
        text: text of the Terminal, whether punctuation or a word
        position: global position of the Terminal in the passage, starting at 1
        paragraph: which paragraph the Terminal belongs to, starting at 1
        para_pos: the position of the Terminal in the paragraph,
            starting at 1 (per paragraph).
        punct: whether the Terminal is a punctuation mark (boolean)
        tok: the entry for this Terminal in the layer's "doc" extra data,
            or None if there is no such entry

    """

    @property
    def text(self):
        """The value stored under the 'text' attribute."""
        return self.attrib['text']

    @property
    def position(self):
        """The integer that follows the layer prefix in the Terminal's ID."""
        # the format of ID is LAYER_ID + ID separator + position
        return int(self.ID[len(LAYER_ID) + len(core.Node.ID_SEPARATOR):])

    @property
    def para_pos(self):
        """The value stored under the 'paragraph_position' attribute."""
        return self.attrib['paragraph_position']

    @property
    def paragraph(self):
        """The value stored under the 'paragraph' attribute."""
        return self.attrib['paragraph']

    @property
    def tok(self):
        """The layer's "doc" entry for this Terminal, or None if missing.

        Looks up ``layer.extra["doc"]`` by paragraph and then by position in
        the paragraph (both converted from 1-based to 0-based indices).
        A missing "doc" key or an out-of-range index yields None.
        """
        try:
            return self.layer.extra["doc"][self.paragraph - 1][self.para_pos - 1]
        except (KeyError, IndexError):
            return None

    def get_annotation(self, attr, as_array=False):
        """Returns the annotation of this Terminal for the given attribute.

        :param attr: object exposing ``key`` and ``value`` and callable on a
            single value -- presumably an annotation attribute enum member
            defined elsewhere in the package; TODO confirm against callers
        :param as_array: if True, read the value from :attr:`tok` (indexed by
            ``attr.value``) and pass it through ``attr(...)``; otherwise read
            it from this Terminal's ``extra`` dictionary under ``attr.key``

        :return: the annotation value; None when as_array is False and
            ``attr.key`` is not in ``extra``
        """
        # NOTE(review): with as_array=True and no "doc" entry, self.tok is
        # None and the subscription raises TypeError -- confirm callers always
        # populate the layer's "doc" data first.
        return attr(self.tok[attr.value]) if as_array else self.extra.get(attr.key)

    @property
    def attrib(self):
        """A shallow copy of the attribute dictionary."""
        return self._attrib.copy()

    @property
    def punct(self):
        """True iff the Terminal's tag is NodeTags.Punct."""
        return self.tag == NodeTags.Punct

    def get_terminals(self, punct=True, *args, **kwargs):
        """Returns a list containing just this Terminal.

        :param punct: whether to include punctuation Terminals, defaults to True

        :return: a list of :class:`layer0`.Terminal objects; empty if this
            Terminal is punctuation and punct is False
        """
        # extra arguments are accepted and ignored
        del args, kwargs
        return [] if self.punct and not punct else [self]

    def equals(self, other, *, ordered=False, **kwargs):
        """Equals if the Terminals agree on Layer ID, tag, position, text,
        paragraph and position in paragraph.

        Unlike ``==``, the Passage (root) of the Terminals is not compared.

        :param other: another Terminal to equal to
        :param ordered: unused, here for API conformity.
        :param kwargs: ignored

        :return: True iff the two Terminals are equal.
        """
        return (self.layer.ID == other.layer.ID and self.text == other.text
                and self.position == other.position and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __eq__(self, other):
        """Equals if both of the same Passage, Layer, position, tag & text.

        The paragraph and position in paragraph must match as well. Objects
        that have no layer (e.g. None or a string) are never equal to a
        Terminal; NotImplemented is returned for them so that Python falls
        back to its default comparison instead of raising AttributeError.
        """
        other_layer = getattr(other, "layer", None)
        if other_layer is None:
            return NotImplemented
        if other_layer.ID != LAYER_ID:
            return False
        return (self.root == other.root and self.layer.ID == other.layer.ID
                and self.position == other.position
                and self.text == other.text and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __hash__(self):
        """Hashes the Terminals according to its ID and text."""
        return hash(self.ID + str(self.text))

    def __str__(self):
        """Returns the text of the Terminal."""
        return self.text

    # Terminal are immutable (except the extra dictionary which is
    # just a temporary playground) and have no children, so enforce it
    def add(self, *args, **kwargs):
        """Always raises NotImplementedError: Terminals have no children."""
        raise NotImplementedError()

    def remove(self, *args, **kwargs):
        """Always raises NotImplementedError: Terminals have no children."""
        raise NotImplementedError()


class Layer0(core.Layer):
    """Represents the :class:`Terminal` objects layer.

    Attributes:
        words: a tuple of only the words (not punctuation) Terminals, ordered
        pairs: a tuple of (position, terminal) tuples of all Terminals, ordered

    """

    def __init__(self, root, attrib=None):
        """Creates the layer with ID LAYER_ID under the given root.

        :param root: the object this layer belongs to, passed on to
            :class:`core`.Layer
        :param attrib: attribute dictionary passed on to :class:`core`.Layer
        """
        super().__init__(ID=LAYER_ID, root=root, attrib=attrib)

    @property
    def words(self):
        """All non-punctuation Terminals of the layer, in layer order."""
        return tuple(x for x in self._all if not x.punct)

    @property
    def pairs(self):
        """Tuples of (1-based index in the layer, Terminal), in layer order."""
        return tuple(enumerate(self._all, start=1))

    def by_position(self, pos):
        """Returns the Terminals at the position given.

        :param pos: the position of the Terminal object, starting at 1
        :return: the Terminal in this position
        :raise IndexError: if the position is out of bounds
        """
        # Without this check, pos <= 0 would be turned into a negative list
        # index and silently return a Terminal counted from the end.
        if pos < 1:
            raise IndexError(
                "Terminal position out of range: {}".format(pos))
        return self._all[pos - 1]  # positions start at 1, not 0

    def add_terminal(self, text, punct, paragraph=1):
        """Adds the next Terminal at the next available position.

        Creates a :class:`Terminal` object with the next position, assuming that
        all positions are filled (no holes). Its position in the paragraph
        continues the count of the last Terminal if that one is in the same
        paragraph, and restarts at 1 otherwise.

        :param text: the text of the Terminal
        :param punct: boolean, whether it's a punctuation mark
        :param paragraph: paragraph number, defaults to 1

        :return: the created Terminal

        :raise DuplicateIdError: if trying to add an already existing Terminal,
                caused by un-ordered Terminal positions in the layer
        """
        position = len(self._all) + 1  # we want positions to start with 1
        if self._all and paragraph == self._all[-1].paragraph:
            para_pos = self._all[-1].para_pos + 1
        else:
            para_pos = 1  # first Terminal of the layer or of a new paragraph
        tag = NodeTags.Punct if punct else NodeTags.Word
        # NOTE(review): nothing here inserts the Terminal into the layer;
        # presumably the core.Node constructor registers it -- confirm.
        return Terminal(ID="{}{}{}".format(LAYER_ID, core.Node.ID_SEPARATOR, position),
                        root=self.root, tag=tag,
                        attrib={'text': text,
                                'paragraph': paragraph,
                                'paragraph_position': para_pos})

    def copy(self, other_passage):
        """Creates a copied Layer0 object and Terminals in other_passage.

        The attribute and extra dictionaries of the layer and of each Terminal
        are copied shallowly.

        :param other_passage: the Passage to copy self to

        :return: the new Layer0 object created in other_passage
        """
        other = Layer0(root=other_passage, attrib=self.attrib.copy())
        other.extra = self.extra.copy()
        for t in self._all:
            copied = other.add_terminal(t.text, t.punct, t.paragraph)
            copied.extra = t.extra.copy()
        return other

    def docs(self, num_paragraphs=1):
        """Returns the "doc" list from extra, with at least num_paragraphs entries.

        The list is created as ``[[]]`` if the "doc" key is missing, and empty
        lists are appended in place until it has num_paragraphs entries.

        :param num_paragraphs: minimal number of entries required, defaults to 1
        :return: the list stored under ``extra["doc"]``
        """
        docs = self.extra.setdefault("doc", [[]])
        while len(docs) < num_paragraphs:
            docs.append([])
        return docs

    def doc(self, paragraph):
        """Returns the "doc" entry of the given paragraph, creating it if needed.

        :param paragraph: paragraph number, starting at 1
        :return: the list stored for this paragraph in ``extra["doc"]``
        """
        return self.docs(paragraph)[paragraph - 1]


def is_punct(node):
    """Tells whether a node is a layer 0 punctuation mark.

    Works for nodes of any layer: a node whose layer ID is not LAYER_ID
    yields False without its ``punct`` attribute being read.
    """
    if node.layer.ID != LAYER_ID:
        return False
    return node.punct