aboutsummaryrefslogtreecommitdiffstats
path: root/snips_inference_agl/pipeline/configs/intent_classifier.py
blob: fc22c87f650a8a7b8b09d28bab805e00f7467032 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
from __future__ import unicode_literals

from snips_inference_agl.common.from_dict import FromDict
from snips_inference_agl.constants import (
    CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
from snips_inference_agl.entity_parser.custom_entity_parser import (
    CustomEntityParserUsage)
from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
from snips_inference_agl.resources import merge_required_resources


class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.LogRegIntentClassifier`"""

    # pylint: disable=line-too-long
    def __init__(self, data_augmentation_config=None, featurizer_config=None,
                 noise_reweight_factor=1.0):
        """
        Args:
            data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
                    Defines the strategy of the underlying data augmentation
            featurizer_config (:class:`FeaturizerConfig`): Configuration of the
                :class:`.Featurizer` used underneath
            noise_reweight_factor (float, optional): this parameter allows to
                change the weight of the None class. By default, the class
                weights are computed using a "balanced" strategy. The
                noise_reweight_factor allows to deviate from this strategy.
        """
        if data_augmentation_config is None:
            data_augmentation_config = IntentClassifierDataAugmentationConfig()
        if featurizer_config is None:
            featurizer_config = FeaturizerConfig()
        # Initialize the backing attributes first so that the property
        # setters below can validate and coerce the provided values
        self._data_augmentation_config = None
        self.data_augmentation_config = data_augmentation_config
        self._featurizer_config = None
        self.featurizer_config = featurizer_config
        self.noise_reweight_factor = noise_reweight_factor

    # pylint: enable=line-too-long

    @property
    def data_augmentation_config(self):
        return self._data_augmentation_config

    @data_augmentation_config.setter
    def data_augmentation_config(self, value):
        # Accept either a ready-made config object or its dict representation
        if isinstance(value, dict):
            self._data_augmentation_config = \
                IntentClassifierDataAugmentationConfig.from_dict(value)
        elif isinstance(value, IntentClassifierDataAugmentationConfig):
            self._data_augmentation_config = value
        else:
            # Fixed: the concatenated message was missing a space between
            # "dict" and "but", yielding "dictbut received"
            raise TypeError("Expected instance of "
                            "IntentClassifierDataAugmentationConfig or dict "
                            "but received: %s" % type(value))

    @property
    def featurizer_config(self):
        return self._featurizer_config

    @featurizer_config.setter
    def featurizer_config(self, value):
        # Accept either a ready-made config object or its dict representation
        if isinstance(value, dict):
            self._featurizer_config = \
                FeaturizerConfig.from_dict(value)
        elif isinstance(value, FeaturizerConfig):
            self._featurizer_config = value
        else:
            # Fixed: the concatenated message was missing a space between
            # "dict" and "but", yielding "dictbut received"
            raise TypeError("Expected instance of FeaturizerConfig or dict "
                            "but received: %s" % type(value))

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import LogRegIntentClassifier
        return LogRegIntentClassifier.unit_name

    def get_required_resources(self):
        """Merge the resources required by the data augmentation and
        featurizer sub-configurations into a single resources dict"""
        resources = self.data_augmentation_config.get_required_resources()
        resources = merge_required_resources(
            resources, self.featurizer_config.get_required_resources())
        return resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "data_augmentation_config":
                self.data_augmentation_config.to_dict(),
            "featurizer_config": self.featurizer_config.to_dict(),
            "noise_reweight_factor": self.noise_reweight_factor,
        }


class IntentClassifierDataAugmentationConfig(FromDict, Config):
    """Configuration used by a :class:`.LogRegIntentClassifier` which defines
        how to augment data to improve the training of the classifier"""

    def __init__(self, min_utterances=20, noise_factor=5,
                 add_builtin_entities_examples=True, unknown_word_prob=0,
                 unknown_words_replacement_string=None,
                 max_unknown_words=None):
        """
        Args:
            min_utterances (int, optional): The minimum number of utterances to
                automatically generate for each intent, based on the existing
                utterances. Default is 20.
            noise_factor (int, optional): Defines the size of the noise to
                generate to train the implicit *None* intent, as a multiplier
                of the average size of the other intents. Default is 5.
            add_builtin_entities_examples (bool, optional): If True, some
                builtin entity examples will be automatically added to the
                training data. Default is True.
            unknown_word_prob (float, optional): Probability of replacing a
                word with the replacement string. Requires
                ``unknown_words_replacement_string`` when positive. Default
                is 0.
            unknown_words_replacement_string (str, optional): String used to
                replace unknown words.
            max_unknown_words (int, optional): Upper bound on replaced words;
                must be None or >= 0.

        Raises:
            ValueError: If ``max_unknown_words`` is negative, or if
                ``unknown_word_prob`` is positive while
                ``unknown_words_replacement_string`` is None.
        """
        # Validate before storing so invalid values are rejected up front
        if max_unknown_words is not None and max_unknown_words < 0:
            raise ValueError("max_unknown_words must be None or >= 0")
        if unknown_word_prob > 0 and unknown_words_replacement_string is None:
            raise ValueError("unknown_word_prob is positive (%s) but the "
                             "replacement string is None" % unknown_word_prob)
        self.min_utterances = min_utterances
        self.noise_factor = noise_factor
        self.add_builtin_entities_examples = add_builtin_entities_examples
        self.unknown_word_prob = unknown_word_prob
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        self.max_unknown_words = max_unknown_words

    @staticmethod
    def get_required_resources():
        # Noise utterances and stop words are always needed for augmentation
        return {
            NOISE: True,
            STOP_WORDS: True
        }

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "min_utterances": self.min_utterances,
            "noise_factor": self.noise_factor,
            "add_builtin_entities_examples":
                self.add_builtin_entities_examples,
            "unknown_word_prob": self.unknown_word_prob,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "max_unknown_words": self.max_unknown_words
        }


class FeaturizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.Featurizer` object"""

    # pylint: disable=line-too-long
    def __init__(self, tfidf_vectorizer_config=None,
                 cooccurrence_vectorizer_config=None,
                 pvalue_threshold=0.4,
                 added_cooccurrence_feature_ratio=0):
        """
        Args:
            tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional):
                empty configuration of the featurizer's
                :attr:`tfidf_vectorizer`
            cooccurrence_vectorizer_config: (:class:`.CooccurrenceVectorizerConfig`, optional):
                configuration of the featurizer's
                :attr:`cooccurrence_vectorizer`
            pvalue_threshold (float): after fitting the training set to
                extract tfidf features, a univariate feature selection is
                applied. Features are tested for independence using a Chi-2
                test, under the null hypothesis that each feature should be
                equally present in each class. Only features having a p-value
                lower than the threshold are kept
            added_cooccurrence_feature_ratio (float, optional): proportion of
                cooccurrence features to add with respect to the number of
                tfidf features. For instance with a ratio of 0.5, if 100 tfidf
                features are remaining after feature selection, a maximum of 50
                cooccurrence features will be added
        """
        self.pvalue_threshold = pvalue_threshold
        self.added_cooccurrence_feature_ratio = \
            added_cooccurrence_feature_ratio

        # Sub-configs may be given as dicts (deserialized form), as config
        # objects, or omitted entirely, in which case defaults are used
        if isinstance(tfidf_vectorizer_config, dict):
            tfidf_vectorizer_config = \
                TfidfVectorizerConfig.from_dict(tfidf_vectorizer_config)
        elif tfidf_vectorizer_config is None:
            tfidf_vectorizer_config = TfidfVectorizerConfig()
        self.tfidf_vectorizer_config = tfidf_vectorizer_config

        if isinstance(cooccurrence_vectorizer_config, dict):
            cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig \
                .from_dict(cooccurrence_vectorizer_config)
        elif cooccurrence_vectorizer_config is None:
            cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
        self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config

    # pylint: enable=line-too-long

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import Featurizer
        return Featurizer.unit_name

    def get_required_resources(self):
        """Merge the resources required by the vectorizer sub-configurations"""
        required_resources = self.tfidf_vectorizer_config \
            .get_required_resources()
        if self.cooccurrence_vectorizer_config:
            required_resources = merge_required_resources(
                required_resources,
                self.cooccurrence_vectorizer_config.get_required_resources())
        return required_resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "pvalue_threshold": self.pvalue_threshold,
            "added_cooccurrence_feature_ratio":
                self.added_cooccurrence_feature_ratio,
            "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(),
            "cooccurrence_vectorizer_config":
                self.cooccurrence_vectorizer_config.to_dict(),
        }


class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.TfidfVectorizer` object"""

    def __init__(self, word_clusters_name=None, use_stemming=False):
        """
        Args:
            word_clusters_name (str, optional): if a word cluster name is
                provided then the featurizer will use the word clusters IDs
                detected in the utterances and add them to the utterance text
                before computing the tfidf. Default to None
            use_stemming (bool, optional): use stemming before computing the
                tfdif. Defaults to False (no stemming used)
        """
        self.word_clusters_name = word_clusters_name
        self.use_stemming = use_stemming

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import TfidfVectorizer
        return TfidfVectorizer.unit_name

    def get_required_resources(self):
        """Stems are needed when stemming is enabled; word clusters are
        required only when a cluster name is configured"""
        resources = {STEMS: bool(self.use_stemming)}
        if self.word_clusters_name:
            resources[WORD_CLUSTERS] = {self.word_clusters_name}
        return resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "word_clusters_name": self.word_clusters_name,
            "use_stemming": self.use_stemming
        }


class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.CooccurrenceVectorizer` object"""

    def __init__(self, window_size=None, unknown_words_replacement_string=None,
                 filter_stop_words=True, keep_order=True):
        """
        Args:
            window_size (int, optional): if provided, word cooccurrences will
                be taken into account only in a context window of size
                :attr:`window_size`. If the window size is 3 then given a word
                w[i], the vectorizer will only extract the following pairs:
                (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]).
                Defaults to None, which means that we consider all words
            unknown_words_replacement_string (str, optional): string used in
                place of unknown words when extracting cooccurrences
            filter_stop_words (bool, optional): if True, stop words are ignored
                when computing cooccurrences
            keep_order (bool, optional): if True then cooccurrence are computed
                taking the words order into account, which means the pairs
                (w1, w2) and (w2, w1) will count as two separate features.
                Defaults to `True`.
        """
        self.window_size = window_size
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        self.filter_stop_words = filter_stop_words
        self.keep_order = keep_order

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
        return CooccurrenceVectorizer.unit_name

    def get_required_resources(self):
        """Return the resources this vectorizer needs at runtime"""
        # We require the parser to be trained without stems because we
        # don't normalize and stem when processing in the
        # CooccurrenceVectorizer (in order to run the builtin and
        # custom parser on the same unormalized input).
        # Requiring no stems ensures we'll be able to parse the unstemmed
        # input
        return {
            STOP_WORDS: self.filter_stop_words,
            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
        }

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "window_size": self.window_size,
            "filter_stop_words": self.filter_stop_words,
            "keep_order": self.keep_order
        }