summaryrefslogtreecommitdiffstats
path: root/snips_inference_agl/dataset/entity.py
blob: 65b99949e7693e4019708627401f4a88a76334c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# coding=utf-8
from __future__ import unicode_literals

from builtins import str
from io import IOBase

from snips_inference_agl.constants import (
    AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS,
    USE_SYNONYMS, VALUE)
from snips_inference_agl.exceptions import EntityFormatError


class Entity(object):
    """Entity data of a :class:`.Dataset`

    This class can represent either a custom or a builtin entity. When the
    entity is a builtin one, only the `name` attribute is relevant.

    Attributes:
        name (str): name of the entity
        utterances (list of :class:`.EntityUtterance`): entity utterances
            (only for custom entities)
        automatically_extensible (bool): whether or not the entity can be
            extended to values not present in the data (only for custom
            entities)
        use_synonyms (bool): whether or not to map entity values using
            synonyms (only for custom entities)
        matching_strictness (float): controls the matching strictness of the
            entity (only for custom entities). Must be between 0.0 and 1.0.
    """

    def __init__(self, name, utterances=None, automatically_extensible=True,
                 use_synonyms=True, matching_strictness=1.0):
        # A new list is created for each instance so that entities built
        # without utterances never share the same list object
        if utterances is None:
            utterances = []
        self.name = name
        self.utterances = utterances
        self.automatically_extensible = automatically_extensible
        self.use_synonyms = use_synonyms
        self.matching_strictness = matching_strictness

    @property
    def is_builtin(self):
        """Whether the entity name is one of the builtin entity names"""
        from snips_nlu_parsers import get_all_builtin_entities

        return self.name in get_all_builtin_entities()

    @classmethod
    def from_yaml(cls, yaml_dict):
        """Build an :class:`.Entity` from its YAML definition object

        Args:
            yaml_dict (dict or :class:`.IOBase`): object containing the YAML
                definition of the entity. It can be either a stream, or the
                corresponding python dict.

        Examples:
            An entity can be defined with a YAML document following the schema
            illustrated in the example below:

            >>> import io
            >>> from snips_inference_agl.common.utils import json_string
            >>> entity_yaml = io.StringIO('''
            ... # City Entity
            ... ---
            ... type: entity
            ... name: city
            ... automatically_extensible: false # default value is true
            ... use_synonyms: false # default value is true
            ... matching_strictness: 0.8 # default value is 1.0
            ... values:
            ...   - london
            ...   - [new york, big apple]
            ...   - [paris, city of lights]''')
            >>> entity = Entity.from_yaml(entity_yaml)
            >>> print(json_string(entity.json, indent=4, sort_keys=True))
            {
                "automatically_extensible": false,
                "data": [
                    {
                        "synonyms": [],
                        "value": "london"
                    },
                    {
                        "synonyms": [
                            "big apple"
                        ],
                        "value": "new york"
                    },
                    {
                        "synonyms": [
                            "city of lights"
                        ],
                        "value": "paris"
                    }
                ],
                "matching_strictness": 0.8,
                "use_synonyms": false
            }

        Returns:
            :class:`.Entity`: the entity described by the YAML definition

        Raises:
            EntityFormatError: When the YAML dict does not correspond to the
                :ref:`expected entity format <yaml_entity_format>`. This
                includes a definition which is not a mapping, a wrong
                `type`, a missing `name`, and a value which is neither a
                string nor a non-empty list.
        """
        if isinstance(yaml_dict, IOBase):
            from snips_inference_agl.dataset.yaml_wrapper import yaml

            yaml_dict = yaml.safe_load(yaml_dict)

        # The definition is not guaranteed to be a mapping (the loaded
        # document could be empty, a list or a scalar): report it as a format
        # error rather than failing on the `.get` calls below
        if not isinstance(yaml_dict, dict):
            raise EntityFormatError(
                "YAML entity definition must be a mapping, but found: %s"
                % type(yaml_dict))

        object_type = yaml_dict.get("type")
        if object_type and object_type != "entity":
            raise EntityFormatError("Wrong type: '%s'" % object_type)
        entity_name = yaml_dict.get("name")
        if not entity_name:
            raise EntityFormatError("Missing 'name' attribute")
        auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True)
        use_synonyms = yaml_dict.get(USE_SYNONYMS, True)
        # Same key as the one written by the `json` property
        matching_strictness = yaml_dict.get(MATCHING_STRICTNESS, 1.0)

        # A `values` key which is present but left empty yields None, which
        # is handled like a missing key
        entity_values = yaml_dict.get("values")
        if entity_values is None:
            entity_values = []

        utterances = []
        for entity_value in entity_values:
            if isinstance(entity_value, list):
                # The first item is the value and the remaining ones are its
                # synonyms, hence an empty list cannot be interpreted
                if not entity_value:
                    raise EntityFormatError(
                        "YAML entity values must not be empty lists")
                utterance = EntityUtterance(entity_value[0], entity_value[1:])
            elif isinstance(entity_value, str):
                utterance = EntityUtterance(entity_value)
            else:
                raise EntityFormatError(
                    "YAML entity values must be either strings or lists, but "
                    "found: %s" % type(entity_value))
            utterances.append(utterance)

        return cls(name=entity_name,
                   utterances=utterances,
                   automatically_extensible=auto_extensible,
                   use_synonyms=use_synonyms,
                   matching_strictness=matching_strictness)

    @property
    def json(self):
        """Returns the entity in json format

        A builtin entity is serialized as an empty dict.
        """
        if self.is_builtin:
            return dict()
        return {
            AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible,
            USE_SYNONYMS: self.use_synonyms,
            DATA: [u.json for u in self.utterances],
            MATCHING_STRICTNESS: self.matching_strictness
        }


class EntityUtterance(object):
    """A single value of a custom :class:`.Entity`, along with its synonyms

    Attributes:
        value (str): entity value
        synonyms (list of str): alternative wordings which are remapped to
            the utterance value
    """

    def __init__(self, value, synonyms=None):
        # A fresh list is created per instance when no synonyms are provided
        self.synonyms = [] if synonyms is None else synonyms
        self.value = value

    @property
    def variations(self):
        """List made of the value followed by all of its synonyms"""
        canonical = [self.value]
        return canonical + self.synonyms

    @property
    def json(self):
        """Returns the utterance in json format"""
        serialized = dict()
        serialized[VALUE] = self.value
        serialized[SYNONYMS] = self.synonyms
        return serialized


def utf_8_encoder(f):
    """Lazily yield each item of `f` encoded as UTF-8 bytes

    Args:
        f (iterable of str): any iterable of text items; iterating over it
            is expected to produce objects with an `encode` method

    Yields:
        the result of `encode("utf-8")` for each item, in iteration order
    """
    target_encoding = "utf-8"
    for text_line in f:
        encoded_line = text_line.encode(target_encoding)
        yield encoded_line