1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
|
from __future__ import unicode_literals
from snips_inference_agl.common.from_dict import FromDict
from snips_inference_agl.constants import (
CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
from snips_inference_agl.entity_parser.custom_entity_parser import (
CustomEntityParserUsage)
from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
from snips_inference_agl.resources import merge_required_resources
class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.LogRegIntentClassifier`"""

    # pylint: disable=line-too-long
    def __init__(self, data_augmentation_config=None, featurizer_config=None,
                 noise_reweight_factor=1.0):
        """
        Args:
            data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
                Defines the strategy of the underlying data augmentation
            featurizer_config (:class:`FeaturizerConfig`): Configuration of the
                :class:`.Featurizer` used underneath
            noise_reweight_factor (float, optional): this parameter allows to
                change the weight of the None class. By default, the class
                weights are computed using a "balanced" strategy. The
                noise_reweight_factor allows to deviate from this strategy.
        """
        if data_augmentation_config is None:
            data_augmentation_config = IntentClassifierDataAugmentationConfig()
        if featurizer_config is None:
            featurizer_config = FeaturizerConfig()
        # The private attributes are pre-declared so the property setters
        # below can run validation/coercion during construction.
        self._data_augmentation_config = None
        self.data_augmentation_config = data_augmentation_config
        self._featurizer_config = None
        self.featurizer_config = featurizer_config
        self.noise_reweight_factor = noise_reweight_factor

    # pylint: enable=line-too-long

    @property
    def data_augmentation_config(self):
        return self._data_augmentation_config

    @data_augmentation_config.setter
    def data_augmentation_config(self, value):
        # Accept either an already-built config object or its dict form
        if isinstance(value, dict):
            self._data_augmentation_config = \
                IntentClassifierDataAugmentationConfig.from_dict(value)
        elif isinstance(value, IntentClassifierDataAugmentationConfig):
            self._data_augmentation_config = value
        else:
            # Fixed: adjacent string literals were concatenated without a
            # separating space ("dictbut received")
            raise TypeError("Expected instance of "
                            "IntentClassifierDataAugmentationConfig or dict "
                            "but received: %s" % type(value))

    @property
    def featurizer_config(self):
        return self._featurizer_config

    @featurizer_config.setter
    def featurizer_config(self, value):
        # Accept either an already-built config object or its dict form
        if isinstance(value, dict):
            self._featurizer_config = \
                FeaturizerConfig.from_dict(value)
        elif isinstance(value, FeaturizerConfig):
            self._featurizer_config = value
        else:
            # Fixed: adjacent string literals were concatenated without a
            # separating space ("dictbut received")
            raise TypeError("Expected instance of FeaturizerConfig or dict "
                            "but received: %s" % type(value))

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import LogRegIntentClassifier
        return LogRegIntentClassifier.unit_name

    def get_required_resources(self):
        """Merge the resources required by the augmentation and featurizer
        sub-configurations into a single resources dict"""
        resources = self.data_augmentation_config.get_required_resources()
        resources = merge_required_resources(
            resources, self.featurizer_config.get_required_resources())
        return resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "data_augmentation_config":
                self.data_augmentation_config.to_dict(),
            "featurizer_config": self.featurizer_config.to_dict(),
            "noise_reweight_factor": self.noise_reweight_factor,
        }
class IntentClassifierDataAugmentationConfig(FromDict, Config):
    """Configuration used by a :class:`.LogRegIntentClassifier` which defines
    how to augment data to improve the training of the classifier"""

    def __init__(self, min_utterances=20, noise_factor=5,
                 add_builtin_entities_examples=True, unknown_word_prob=0,
                 unknown_words_replacement_string=None,
                 max_unknown_words=None):
        """
        Args:
            min_utterances (int, optional): The minimum number of utterances to
                automatically generate for each intent, based on the existing
                utterances. Default is 20.
            noise_factor (int, optional): Defines the size of the noise to
                generate to train the implicit *None* intent, as a multiplier
                of the average size of the other intents. Default is 5.
            add_builtin_entities_examples (bool, optional): If True, some
                builtin entity examples will be automatically added to the
                training data. Default is True.
            unknown_word_prob (float, optional): Probability with which words
                are replaced by the unknown words replacement string during
                augmentation. Default is 0 (no replacement). If positive, a
                replacement string must be provided.
            unknown_words_replacement_string (str, optional): Token used to
                replace words when simulating unknown words. Default is None.
            max_unknown_words (int, optional): Upper bound on the number of
                unknown words introduced per utterance. Must be None or >= 0.
                Default is None (no bound).

        Raises:
            ValueError: If ``max_unknown_words`` is negative, or if
                ``unknown_word_prob`` is positive while
                ``unknown_words_replacement_string`` is None.
        """
        self.min_utterances = min_utterances
        self.noise_factor = noise_factor
        self.add_builtin_entities_examples = add_builtin_entities_examples
        self.unknown_word_prob = unknown_word_prob
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        if max_unknown_words is not None and max_unknown_words < 0:
            raise ValueError("max_unknown_words must be None or >= 0")
        self.max_unknown_words = max_unknown_words
        # A positive replacement probability is meaningless without a
        # replacement string, so reject the inconsistent combination
        if unknown_word_prob > 0 and unknown_words_replacement_string is None:
            raise ValueError("unknown_word_prob is positive (%s) but the "
                             "replacement string is None" % unknown_word_prob)

    @staticmethod
    def get_required_resources():
        """Resources needed to apply this augmentation strategy"""
        return {
            NOISE: True,
            STOP_WORDS: True
        }

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "min_utterances": self.min_utterances,
            "noise_factor": self.noise_factor,
            "add_builtin_entities_examples":
                self.add_builtin_entities_examples,
            "unknown_word_prob": self.unknown_word_prob,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "max_unknown_words": self.max_unknown_words
        }
class FeaturizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.Featurizer` object"""

    # pylint: disable=line-too-long
    def __init__(self, tfidf_vectorizer_config=None,
                 cooccurrence_vectorizer_config=None,
                 pvalue_threshold=0.4,
                 added_cooccurrence_feature_ratio=0):
        """
        Args:
            tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional):
                empty configuration of the featurizer's
                :attr:`tfidf_vectorizer`
            cooccurrence_vectorizer_config: (:class:`.CooccurrenceVectorizerConfig`, optional):
                configuration of the featurizer's
                :attr:`cooccurrence_vectorizer`
            pvalue_threshold (float): after fitting the training set to
                extract tfidf features, a univariate feature selection is
                applied. Features are tested for independence using a Chi-2
                test, under the null hypothesis that each feature should be
                equally present in each class. Only features having a p-value
                lower than the threshold are kept
            added_cooccurrence_feature_ratio (float, optional): proportion of
                cooccurrence features to add with respect to the number of
                tfidf features. For instance with a ratio of 0.5, if 100 tfidf
                features are remaining after feature selection, a maximum of 50
                cooccurrence features will be added
        """
        self.pvalue_threshold = pvalue_threshold
        self.added_cooccurrence_feature_ratio = \
            added_cooccurrence_feature_ratio
        # Both sub-configs accept a ready config object, its dict form, or
        # None (meaning: use the default configuration)
        if isinstance(tfidf_vectorizer_config, dict):
            tfidf_vectorizer_config = TfidfVectorizerConfig.from_dict(
                tfidf_vectorizer_config)
        elif tfidf_vectorizer_config is None:
            tfidf_vectorizer_config = TfidfVectorizerConfig()
        self.tfidf_vectorizer_config = tfidf_vectorizer_config
        if isinstance(cooccurrence_vectorizer_config, dict):
            cooccurrence_vectorizer_config = \
                CooccurrenceVectorizerConfig.from_dict(
                    cooccurrence_vectorizer_config)
        elif cooccurrence_vectorizer_config is None:
            cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
        self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config

    # pylint: enable=line-too-long

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import Featurizer
        return Featurizer.unit_name

    def get_required_resources(self):
        """Merge the resources needed by both vectorizer sub-configurations"""
        resources = self.tfidf_vectorizer_config.get_required_resources()
        cooccurrence_config = self.cooccurrence_vectorizer_config
        if cooccurrence_config:
            resources = merge_required_resources(
                resources, cooccurrence_config.get_required_resources())
        return resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "pvalue_threshold": self.pvalue_threshold,
            "added_cooccurrence_feature_ratio":
                self.added_cooccurrence_feature_ratio,
            "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(),
            "cooccurrence_vectorizer_config":
                self.cooccurrence_vectorizer_config.to_dict(),
        }
class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.TfidfVectorizer` object"""

    def __init__(self, word_clusters_name=None, use_stemming=False):
        """
        Args:
            word_clusters_name (str, optional): if a word cluster name is
                provided then the featurizer will use the word clusters IDs
                detected in the utterances and add them to the utterance text
                before computing the tfidf. Default to None
            use_stemming (bool, optional): use stemming before computing the
                tfdif. Defaults to False (no stemming used)
        """
        self.word_clusters_name = word_clusters_name
        self.use_stemming = use_stemming

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import TfidfVectorizer
        return TfidfVectorizer.unit_name

    def get_required_resources(self):
        """Resources needed by the vectorizer: stems (if stemming is enabled)
        and the configured word clusters (if any)"""
        resources = {STEMS: bool(self.use_stemming)}
        if self.word_clusters_name:
            resources[WORD_CLUSTERS] = {self.word_clusters_name}
        return resources

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "word_clusters_name": self.word_clusters_name,
            "use_stemming": self.use_stemming
        }
class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.CooccurrenceVectorizer` object"""

    def __init__(self, window_size=None, unknown_words_replacement_string=None,
                 filter_stop_words=True, keep_order=True):
        """
        Args:
            window_size (int, optional): if provided, word cooccurrences will
                be taken into account only in a context window of size
                :attr:`window_size`. If the window size is 3 then given a word
                w[i], the vectorizer will only extract the following pairs:
                (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]).
                Defaults to None, which means that we consider all words
            unknown_words_replacement_string (str, optional)
            filter_stop_words (bool, optional): if True, stop words are ignored
                when computing cooccurrences
            keep_order (bool, optional): if True then cooccurrence are computed
                taking the words order into account, which means the pairs
                (w1, w2) and (w2, w1) will count as two separate features.
                Defaults to `True`.
        """
        self.window_size = window_size
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        self.filter_stop_words = filter_stop_words
        self.keep_order = keep_order

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
        return CooccurrenceVectorizer.unit_name

    def get_required_resources(self):
        """Resources needed by the vectorizer"""
        # We require the parser to be trained without stems because we
        # don't normalize and stem when processing in the
        # CooccurrenceVectorizer (in order to run the builtin and
        # custom parser on the same unormalized input).
        # Requiring no stems ensures we'll be able to parse the unstemmed
        # input
        return {
            STOP_WORDS: self.filter_stop_words,
            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
        }

    def to_dict(self):
        """Return a JSON-serializable dict representation of this config"""
        return {
            "unit_name": self.unit_name,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "window_size": self.window_size,
            "filter_stop_words": self.filter_stop_words,
            "keep_order": self.keep_order
        }
|