summaryrefslogtreecommitdiffstats
path: root/snips_inference_agl/pipeline/configs/intent_classifier.py
diff options
context:
space:
mode:
authorMalik Talha <talhamalik727x@gmail.com>2023-10-22 21:06:23 +0500
committerJan-Simon Moeller <jsmoeller@linuxfoundation.org>2023-10-23 14:38:13 +0000
commit697a1adce1e463079e640b55d6386cf82d7bd6bc (patch)
tree86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/pipeline/configs/intent_classifier.py
parent97029ab8141e654a170a2282106f854037da294f (diff)
Add Snips Inference Module
Add slightly modified version of the original Snips NLU library. This module adds support for Python up to version 3.10. Bug-AGL: SPEC-4856 Signed-off-by: Malik Talha <talhamalik727x@gmail.com> Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
Diffstat (limited to 'snips_inference_agl/pipeline/configs/intent_classifier.py')
-rw-r--r--snips_inference_agl/pipeline/configs/intent_classifier.py307
1 files changed, 307 insertions, 0 deletions
diff --git a/snips_inference_agl/pipeline/configs/intent_classifier.py b/snips_inference_agl/pipeline/configs/intent_classifier.py
new file mode 100644
index 0000000..fc22c87
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_classifier.py
@@ -0,0 +1,307 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import (
+ CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
+from snips_inference_agl.entity_parser.custom_entity_parser import (
+ CustomEntityParserUsage)
+from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.LogRegIntentClassifier`"""

    # pylint: disable=line-too-long
    def __init__(self, data_augmentation_config=None, featurizer_config=None,
                 noise_reweight_factor=1.0):
        """
        Args:
            data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
                Defines the strategy of the underlying data augmentation
            featurizer_config (:class:`FeaturizerConfig`): Configuration of the
                :class:`.Featurizer` used underneath
            noise_reweight_factor (float, optional): this parameter allows to
                change the weight of the None class. By default, the class
                weights are computed using a "balanced" strategy. The
                noise_reweight_factor allows to deviate from this strategy.
        """
        if data_augmentation_config is None:
            data_augmentation_config = IntentClassifierDataAugmentationConfig()
        if featurizer_config is None:
            featurizer_config = FeaturizerConfig()
        # Initialize the private attributes first, then assign through the
        # property setters so that dict payloads are deserialized and invalid
        # types are rejected at construction time
        self._data_augmentation_config = None
        self.data_augmentation_config = data_augmentation_config
        self._featurizer_config = None
        self.featurizer_config = featurizer_config
        self.noise_reweight_factor = noise_reweight_factor

    # pylint: enable=line-too-long

    @property
    def data_augmentation_config(self):
        # Always an IntentClassifierDataAugmentationConfig instance
        return self._data_augmentation_config

    @data_augmentation_config.setter
    def data_augmentation_config(self, value):
        if isinstance(value, dict):
            self._data_augmentation_config = \
                IntentClassifierDataAugmentationConfig.from_dict(value)
        elif isinstance(value, IntentClassifierDataAugmentationConfig):
            self._data_augmentation_config = value
        else:
            # Bug fix: the original implicit string concatenation lacked a
            # space between "dict" and "but", yielding "dictbut received"
            raise TypeError("Expected instance of "
                            "IntentClassifierDataAugmentationConfig or dict "
                            "but received: %s" % type(value))

    @property
    def featurizer_config(self):
        # Always a FeaturizerConfig instance
        return self._featurizer_config

    @featurizer_config.setter
    def featurizer_config(self, value):
        if isinstance(value, dict):
            self._featurizer_config = \
                FeaturizerConfig.from_dict(value)
        elif isinstance(value, FeaturizerConfig):
            self._featurizer_config = value
        else:
            # Bug fix: the original implicit string concatenation lacked a
            # space between "dict" and "but", yielding "dictbut received"
            raise TypeError("Expected instance of FeaturizerConfig or dict "
                            "but received: %s" % type(value))

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import LogRegIntentClassifier
        return LogRegIntentClassifier.unit_name

    def get_required_resources(self):
        """Merges the resources required by the data augmentation and
        featurizer sub-configurations"""
        resources = self.data_augmentation_config.get_required_resources()
        resources = merge_required_resources(
            resources, self.featurizer_config.get_required_resources())
        return resources

    def to_dict(self):
        """Returns a json-serializable dict representation of the config"""
        return {
            "unit_name": self.unit_name,
            "data_augmentation_config":
                self.data_augmentation_config.to_dict(),
            "featurizer_config": self.featurizer_config.to_dict(),
            "noise_reweight_factor": self.noise_reweight_factor,
        }
+
+
class IntentClassifierDataAugmentationConfig(FromDict, Config):
    """Configuration used by a :class:`.LogRegIntentClassifier` which defines
    how to augment data to improve the training of the classifier"""

    def __init__(self, min_utterances=20, noise_factor=5,
                 add_builtin_entities_examples=True, unknown_word_prob=0,
                 unknown_words_replacement_string=None,
                 max_unknown_words=None):
        """
        Args:
            min_utterances (int, optional): The minimum number of utterances to
                automatically generate for each intent, based on the existing
                utterances. Default is 20.
            noise_factor (int, optional): Defines the size of the noise to
                generate to train the implicit *None* intent, as a multiplier
                of the average size of the other intents. Default is 5.
            add_builtin_entities_examples (bool, optional): If True, some
                builtin entity examples will be automatically added to the
                training data. Default is True.
            unknown_word_prob (float, optional): presumably the probability of
                substituting words with *unknown_words_replacement_string*
                during augmentation — confirm against the augmentation code.
                Default is 0; when positive, a replacement string is required.
            unknown_words_replacement_string (str, optional): String used as
                the unknown-word replacement. Default is None.
            max_unknown_words (int, optional): Must be None or >= 0.
                Default is None.

        Raises:
            ValueError: If *max_unknown_words* is negative, or if
                *unknown_word_prob* is positive while
                *unknown_words_replacement_string* is None.
        """
        # Validate arguments before assigning anything so an invalid config
        # never leaves behind a partially initialized object
        if max_unknown_words is not None and max_unknown_words < 0:
            raise ValueError("max_unknown_words must be None or >= 0")
        if unknown_word_prob > 0 and unknown_words_replacement_string is None:
            raise ValueError("unknown_word_prob is positive (%s) but the "
                             "replacement string is None" % unknown_word_prob)
        self.min_utterances = min_utterances
        self.noise_factor = noise_factor
        self.add_builtin_entities_examples = add_builtin_entities_examples
        self.unknown_word_prob = unknown_word_prob
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        self.max_unknown_words = max_unknown_words

    @staticmethod
    def get_required_resources():
        # Noise utterances and stop words are needed for data augmentation
        return {
            NOISE: True,
            STOP_WORDS: True
        }

    def to_dict(self):
        """Returns a json-serializable dict representation of the config"""
        return {
            "min_utterances": self.min_utterances,
            "noise_factor": self.noise_factor,
            "add_builtin_entities_examples":
                self.add_builtin_entities_examples,
            "unknown_word_prob": self.unknown_word_prob,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "max_unknown_words": self.max_unknown_words
        }
+
+
class FeaturizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.Featurizer` object"""

    # pylint: disable=line-too-long
    def __init__(self, tfidf_vectorizer_config=None,
                 cooccurrence_vectorizer_config=None,
                 pvalue_threshold=0.4,
                 added_cooccurrence_feature_ratio=0):
        """
        Args:
            tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional):
                configuration for the featurizer's :attr:`tfidf_vectorizer`;
                a dict is deserialized, None selects the defaults
            cooccurrence_vectorizer_config: (:class:`.CooccurrenceVectorizerConfig`, optional):
                configuration for the featurizer's
                :attr:`cooccurrence_vectorizer`; a dict is deserialized,
                None selects the defaults
            pvalue_threshold (float): once tfidf features are extracted from
                the training set, a univariate Chi-2 independence test is run
                under the null hypothesis that every feature is equally
                present in every class; only features whose p-value falls
                below this threshold are kept
            added_cooccurrence_feature_ratio (float, optional): how many
                cooccurrence features to add, expressed as a fraction of the
                tfidf features surviving selection (e.g. a ratio of 0.5 with
                100 remaining tfidf features allows at most 50 cooccurrence
                features)
        """
        self.pvalue_threshold = pvalue_threshold
        self.added_cooccurrence_feature_ratio = \
            added_cooccurrence_feature_ratio
        self.tfidf_vectorizer_config = self._coerce(
            tfidf_vectorizer_config, TfidfVectorizerConfig)
        self.cooccurrence_vectorizer_config = self._coerce(
            cooccurrence_vectorizer_config, CooccurrenceVectorizerConfig)

    # pylint: enable=line-too-long

    @staticmethod
    def _coerce(config, config_cls):
        # Accept None (build defaults), a dict (deserialize through
        # from_dict) or an already-built config (pass through unchanged)
        if config is None:
            return config_cls()
        if isinstance(config, dict):
            return config_cls.from_dict(config)
        return config

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import Featurizer
        return Featurizer.unit_name

    def get_required_resources(self):
        """Merges the resources needed by both vectorizer configurations"""
        resources = self.tfidf_vectorizer_config.get_required_resources()
        cooccurrence_config = self.cooccurrence_vectorizer_config
        if cooccurrence_config:
            resources = merge_required_resources(
                resources, cooccurrence_config.get_required_resources())
        return resources

    def to_dict(self):
        """Returns a json-serializable dict representation of the config"""
        return {
            "unit_name": self.unit_name,
            "pvalue_threshold": self.pvalue_threshold,
            "added_cooccurrence_feature_ratio":
                self.added_cooccurrence_feature_ratio,
            "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(),
            "cooccurrence_vectorizer_config":
                self.cooccurrence_vectorizer_config.to_dict(),
        }
+
+
class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig):
    # Doc fix: the original docstring referred to TfidfVectorizerConfig
    # (this class) instead of the TfidfVectorizer unit it configures
    """Configuration of a :class:`.TfidfVectorizer` object"""

    def __init__(self, word_clusters_name=None, use_stemming=False):
        """
        Args:
            word_clusters_name (str, optional): if a word cluster name is
                provided then the featurizer will use the word clusters IDs
                detected in the utterances and add them to the utterance text
                before computing the tfidf. Default to None
            use_stemming (bool, optional): use stemming before computing the
                tfidf. Defaults to False (no stemming used)
        """
        self.word_clusters_name = word_clusters_name
        self.use_stemming = use_stemming

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import TfidfVectorizer
        return TfidfVectorizer.unit_name

    def get_required_resources(self):
        """Returns the resources needed by this vectorizer configuration"""
        # bool() replaces the redundant "True if ... else False" of the
        # original while producing exactly the same values
        resources = {STEMS: bool(self.use_stemming)}
        if self.word_clusters_name:
            resources[WORD_CLUSTERS] = {self.word_clusters_name}
        return resources

    def to_dict(self):
        """Returns a json-serializable dict representation of the config"""
        return {
            "unit_name": self.unit_name,
            "word_clusters_name": self.word_clusters_name,
            "use_stemming": self.use_stemming
        }
+
+
class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.CooccurrenceVectorizer` object"""

    def __init__(self, window_size=None, unknown_words_replacement_string=None,
                 filter_stop_words=True, keep_order=True):
        """
        Args:
            window_size (int, optional): when set, cooccurrences are only
                counted inside a context window of this size: with a window
                size of 3, a word w[i] only forms the pairs (w[i], w[i + 1]),
                (w[i], w[i + 2]) and (w[i], w[i + 3]). The default of None
                means every pair of words in the utterance is considered
            unknown_words_replacement_string (str, optional)
            filter_stop_words (bool, optional): when True, stop words are
                skipped while counting cooccurrences
            keep_order (bool, optional): when True, word order matters, so
                (w1, w2) and (w2, w1) are two distinct features.
                Defaults to `True`.
        """
        self.window_size = window_size
        self.unknown_words_replacement_string = (
            unknown_words_replacement_string)
        self.filter_stop_words = filter_stop_words
        self.keep_order = keep_order

    @property
    def unit_name(self):
        # Imported lazily to avoid a circular import with the
        # intent_classifier package
        from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
        return CooccurrenceVectorizer.unit_name

    def get_required_resources(self):
        """Returns the resources needed by this vectorizer configuration"""
        resources = {STOP_WORDS: self.filter_stop_words}
        # The custom entity parser must be trained without stems: the
        # CooccurrenceVectorizer neither normalizes nor stems while
        # processing (so that the builtin and custom parsers see the same
        # unnormalized input), and requiring no stems guarantees the parser
        # can handle that unstemmed input.
        resources[CUSTOM_ENTITY_PARSER_USAGE] = \
            CustomEntityParserUsage.WITHOUT_STEMS
        return resources

    def to_dict(self):
        """Returns a json-serializable dict representation of the config"""
        return {
            "unit_name": self.unit_name,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "window_size": self.window_size,
            "filter_stop_words": self.filter_stop_words,
            "keep_order": self.keep_order
        }