diff options
author | 2023-10-22 21:06:23 +0500 | |
---|---|---|
committer | 2023-10-23 14:38:13 +0000 | |
commit | 697a1adce1e463079e640b55d6386cf82d7bd6bc (patch) | |
tree | 86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/pipeline/configs/intent_classifier.py | |
parent | 97029ab8141e654a170a2282106f854037da294f (diff) |
Add Snips Inference Module
Add slightly modified version of the original Snips NLU
library. This module adds support for Python up to version
3.10.
Bug-AGL: SPEC-4856
Signed-off-by: Malik Talha <talhamalik727x@gmail.com>
Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
Diffstat (limited to 'snips_inference_agl/pipeline/configs/intent_classifier.py')
-rw-r--r-- | snips_inference_agl/pipeline/configs/intent_classifier.py | 307 |
1 files changed, 307 insertions, 0 deletions
# diff --git a/snips_inference_agl/pipeline/configs/intent_classifier.py b/snips_inference_agl/pipeline/configs/intent_classifier.py
# new file mode 100644
# index 0000000..fc22c87
# --- /dev/null
# +++ b/snips_inference_agl/pipeline/configs/intent_classifier.py
# @@ -0,0 +1,307 @@
from __future__ import unicode_literals

from snips_inference_agl.common.from_dict import FromDict
from snips_inference_agl.constants import (
    CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
from snips_inference_agl.entity_parser.custom_entity_parser import (
    CustomEntityParserUsage)
from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
from snips_inference_agl.resources import merge_required_resources


class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.LogRegIntentClassifier`"""

    # pylint: disable=line-too-long
    def __init__(self, data_augmentation_config=None, featurizer_config=None,
                 noise_reweight_factor=1.0):
        """
        Args:
            data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
                Defines the strategy of the underlying data augmentation.
                May also be given as a dict, in which case it is converted
                with ``from_dict``. Defaults to a default-constructed config.
            featurizer_config (:class:`FeaturizerConfig`): Configuration of the
                :class:`.Featurizer` used underneath. May also be given as a
                dict, in which case it is converted with ``from_dict``.
                Defaults to a default-constructed config.
            noise_reweight_factor (float, optional): this parameter allows to
                change the weight of the None class. By default, the class
                weights are computed using a "balanced" strategy. The
                noise_reweight_factor allows to deviate from this strategy.

        Raises:
            TypeError: if one of the sub-configurations is neither a dict nor
                an instance of the expected configuration class.
        """
        if data_augmentation_config is None:
            data_augmentation_config = IntentClassifierDataAugmentationConfig()
        if featurizer_config is None:
            featurizer_config = FeaturizerConfig()
        # The backing fields are declared first; the actual values go through
        # the property setters so that dicts are converted and types checked.
        self._data_augmentation_config = None
        self.data_augmentation_config = data_augmentation_config
        self._featurizer_config = None
        self.featurizer_config = featurizer_config
        self.noise_reweight_factor = noise_reweight_factor

    # pylint: enable=line-too-long

    @property
    def data_augmentation_config(self):
        """Data augmentation configuration currently in use"""
        return self._data_augmentation_config

    @data_augmentation_config.setter
    def data_augmentation_config(self, value):
        if isinstance(value, dict):
            self._data_augmentation_config = \
                IntentClassifierDataAugmentationConfig.from_dict(value)
        elif isinstance(value, IntentClassifierDataAugmentationConfig):
            self._data_augmentation_config = value
        else:
            # Note the trailing space after "dict": adjacent string literals
            # are concatenated verbatim.
            raise TypeError("Expected instance of "
                            "IntentClassifierDataAugmentationConfig or dict "
                            "but received: %s" % type(value))

    @property
    def featurizer_config(self):
        """Featurizer configuration currently in use"""
        return self._featurizer_config

    @featurizer_config.setter
    def featurizer_config(self, value):
        if isinstance(value, dict):
            self._featurizer_config = \
                FeaturizerConfig.from_dict(value)
        elif isinstance(value, FeaturizerConfig):
            self._featurizer_config = value
        else:
            raise TypeError("Expected instance of FeaturizerConfig or dict "
                            "but received: %s" % type(value))

    @property
    def unit_name(self):
        # Imported lazily, inside the property, rather than at module level
        from snips_inference_agl.intent_classifier import LogRegIntentClassifier
        return LogRegIntentClassifier.unit_name

    def get_required_resources(self):
        """Merges the resources required by both sub-configurations"""
        resources = self.data_augmentation_config.get_required_resources()
        resources = merge_required_resources(
            resources, self.featurizer_config.get_required_resources())
        return resources

    def to_dict(self):
        """Returns a dict representation of this configuration, including the
        nested sub-configurations"""
        return {
            "unit_name": self.unit_name,
            "data_augmentation_config":
                self.data_augmentation_config.to_dict(),
            "featurizer_config":
                self.featurizer_config.to_dict(),
            "noise_reweight_factor": self.noise_reweight_factor,
        }


class IntentClassifierDataAugmentationConfig(FromDict, Config):
    """Configuration used by a :class:`.LogRegIntentClassifier` which defines
    how to augment data to improve the training of the classifier"""

    def __init__(self, min_utterances=20, noise_factor=5,
                 add_builtin_entities_examples=True, unknown_word_prob=0,
                 unknown_words_replacement_string=None,
                 max_unknown_words=None):
        """
        Args:
            min_utterances (int, optional): The minimum number of utterances to
                automatically generate for each intent, based on the existing
                utterances. Default is 20.
            noise_factor (int, optional): Defines the size of the noise to
                generate to train the implicit *None* intent, as a multiplier
                of the average size of the other intents. Default is 5.
            add_builtin_entities_examples (bool, optional): If True, some
                builtin entity examples will be automatically added to the
                training data. Default is True.
            unknown_word_prob (float, optional): Default is 0. When positive,
                *unknown_words_replacement_string* must be provided.
                Presumably the probability of replacing a word with the
                replacement string during augmentation -- to be confirmed
                against the data augmentation code.
            unknown_words_replacement_string (str, optional): replacement
                string, required when *unknown_word_prob* is positive.
                Default is None.
            max_unknown_words (int, optional): must be None or non-negative.
                Default is None.

        Raises:
            ValueError: if *max_unknown_words* is negative, or if
                *unknown_word_prob* is positive while
                *unknown_words_replacement_string* is None.
        """
        self.min_utterances = min_utterances
        self.noise_factor = noise_factor
        self.add_builtin_entities_examples = add_builtin_entities_examples
        self.unknown_word_prob = unknown_word_prob
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        if max_unknown_words is not None and max_unknown_words < 0:
            raise ValueError("max_unknown_words must be None or >= 0")
        self.max_unknown_words = max_unknown_words
        if unknown_word_prob > 0 and unknown_words_replacement_string is None:
            raise ValueError("unknown_word_prob is positive (%s) but the "
                             "replacement string is None" % unknown_word_prob)

    @staticmethod
    def get_required_resources():
        """Noise and stop words are always required, whatever the settings"""
        return {
            NOISE: True,
            STOP_WORDS: True
        }

    def to_dict(self):
        """Returns a dict representation of this configuration"""
        return {
            "min_utterances": self.min_utterances,
            "noise_factor": self.noise_factor,
            "add_builtin_entities_examples":
                self.add_builtin_entities_examples,
            "unknown_word_prob": self.unknown_word_prob,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "max_unknown_words": self.max_unknown_words
        }


class FeaturizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.Featurizer` object"""

    # pylint: disable=line-too-long
    def __init__(self, tfidf_vectorizer_config=None,
                 cooccurrence_vectorizer_config=None,
                 pvalue_threshold=0.4,
                 added_cooccurrence_feature_ratio=0):
        """
        Args:
            tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional):
                configuration of the featurizer's :attr:`tfidf_vectorizer`.
                May be given as a dict. Defaults to a default-constructed
                configuration.
            cooccurrence_vectorizer_config (:class:`.CooccurrenceVectorizerConfig`, optional):
                configuration of the featurizer's
                :attr:`cooccurrence_vectorizer`. May be given as a dict.
                Defaults to a default-constructed configuration.
            pvalue_threshold (float): after fitting the training set to
                extract tfidf features, a univariate feature selection is
                applied. Features are tested for independence using a Chi-2
                test, under the null hypothesis that each feature should be
                equally present in each class. Only features having a p-value
                lower than the threshold are kept
            added_cooccurrence_feature_ratio (float, optional): proportion of
                cooccurrence features to add with respect to the number of
                tfidf features. For instance with a ratio of 0.5, if 100 tfidf
                features are remaining after feature selection, a maximum of 50
                cooccurrence features will be added
        """
        self.pvalue_threshold = pvalue_threshold
        self.added_cooccurrence_feature_ratio = \
            added_cooccurrence_feature_ratio

        if tfidf_vectorizer_config is None:
            tfidf_vectorizer_config = TfidfVectorizerConfig()
        elif isinstance(tfidf_vectorizer_config, dict):
            tfidf_vectorizer_config = TfidfVectorizerConfig.from_dict(
                tfidf_vectorizer_config)
        self.tfidf_vectorizer_config = tfidf_vectorizer_config

        if cooccurrence_vectorizer_config is None:
            cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
        elif isinstance(cooccurrence_vectorizer_config, dict):
            cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig \
                .from_dict(cooccurrence_vectorizer_config)
        self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config

    # pylint: enable=line-too-long

    @property
    def unit_name(self):
        # Imported lazily, inside the property, rather than at module level
        from snips_inference_agl.intent_classifier import Featurizer
        return Featurizer.unit_name

    def get_required_resources(self):
        """Merges the resources required by the tfidf vectorizer with those of
        the cooccurrence vectorizer, when the latter is set"""
        required_resources = self.tfidf_vectorizer_config \
            .get_required_resources()
        if self.cooccurrence_vectorizer_config:
            required_resources = merge_required_resources(
                required_resources,
                self.cooccurrence_vectorizer_config.get_required_resources())
        return required_resources

    def to_dict(self):
        """Returns a dict representation of this configuration, including the
        nested vectorizer configurations"""
        return {
            "unit_name": self.unit_name,
            "pvalue_threshold": self.pvalue_threshold,
            "added_cooccurrence_feature_ratio":
                self.added_cooccurrence_feature_ratio,
            "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(),
            "cooccurrence_vectorizer_config":
                self.cooccurrence_vectorizer_config.to_dict(),
        }


class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.TfidfVectorizer` object"""

    def __init__(self, word_clusters_name=None, use_stemming=False):
        """
        Args:
            word_clusters_name (str, optional): if a word cluster name is
                provided then the featurizer will use the word clusters IDs
                detected in the utterances and add them to the utterance text
                before computing the tfidf. Default to None
            use_stemming (bool, optional): use stemming before computing the
                tfidf. Defaults to False (no stemming used)
        """
        self.word_clusters_name = word_clusters_name
        self.use_stemming = use_stemming

    @property
    def unit_name(self):
        # Imported lazily, inside the property, rather than at module level
        from snips_inference_agl.intent_classifier import TfidfVectorizer
        return TfidfVectorizer.unit_name

    def get_required_resources(self):
        """Stems are required only when stemming is enabled; word clusters
        are required only when a cluster name is set"""
        resources = {STEMS: bool(self.use_stemming)}
        if self.word_clusters_name:
            resources[WORD_CLUSTERS] = {self.word_clusters_name}
        return resources

    def to_dict(self):
        """Returns a dict representation of this configuration"""
        return {
            "unit_name": self.unit_name,
            "word_clusters_name": self.word_clusters_name,
            "use_stemming": self.use_stemming
        }


class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig):
    """Configuration of a :class:`.CooccurrenceVectorizer` object"""

    def __init__(self, window_size=None, unknown_words_replacement_string=None,
                 filter_stop_words=True, keep_order=True):
        """
        Args:
            window_size (int, optional): if provided, word cooccurrences will
                be taken into account only in a context window of size
                :attr:`window_size`. If the window size is 3 then given a word
                w[i], the vectorizer will only extract the following pairs:
                (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]).
                Defaults to None, which means that we consider all words
            unknown_words_replacement_string (str, optional)
            filter_stop_words (bool, optional): if True, stop words are ignored
                when computing cooccurrences
            keep_order (bool, optional): if True then cooccurrence are computed
                taking the words order into account, which means the pairs
                (w1, w2) and (w2, w1) will count as two separate features.
                Defaults to `True`.
        """
        self.window_size = window_size
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
        self.filter_stop_words = filter_stop_words
        self.keep_order = keep_order

    @property
    def unit_name(self):
        # Imported lazily, inside the property, rather than at module level
        from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
        return CooccurrenceVectorizer.unit_name

    def get_required_resources(self):
        """Stop words are required only when they are filtered; the custom
        entity parser is always required without stems"""
        return {
            STOP_WORDS: self.filter_stop_words,
            # We require the parser to be trained without stems because we
            # don't normalize and stem when processing in the
            # CooccurrenceVectorizer (in order to run the builtin and
            # custom parser on the same unnormalized input).
            # Requiring no stems ensures we'll be able to parse the unstemmed
            # input
            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
        }

    def to_dict(self):
        """Returns a dict representation of this configuration"""
        return {
            "unit_name": self.unit_name,
            "unknown_words_replacement_string":
                self.unknown_words_replacement_string,
            "window_size": self.window_size,
            "filter_stop_words": self.filter_stop_words,
            "keep_order": self.keep_order
        }