author     Malik Talha <talhamalik727x@gmail.com>              2023-10-22 21:06:23 +0500
committer  Jan-Simon Moeller <jsmoeller@linuxfoundation.org>   2023-10-23 14:38:13 +0000
commit     697a1adce1e463079e640b55d6386cf82d7bd6bc (patch)
tree       86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/pipeline
parent     97029ab8141e654a170a2282106f854037da294f (diff)
Add Snips Inference Module
Add a slightly modified version of the original Snips NLU
library. This module adds support for Python versions up to
3.10.
Bug-AGL: SPEC-4856
Signed-off-by: Malik Talha <talhamalik727x@gmail.com>
Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
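
For context, the sketch below shows how an inference-only engine from this module is typically consumed. It assumes the fork keeps the upstream Snips NLU engine API (`from_path` and `parse`); the engine directory path is hypothetical.

    from snips_inference_agl.nlu_engine.nlu_engine import SnipsNLUEngine

    # Hypothetical path to an engine previously trained with Snips NLU tooling
    engine = SnipsNLUEngine.from_path("/usr/share/nlu/engine")

    # parse() returns a dict with the detected intent and extracted slots
    result = engine.parse("turn on the living room lights")
    print(result["intent"]["intentName"])
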
Diffstat (limited to 'snips_inference_agl/pipeline')
-rw-r--r--  snips_inference_agl/pipeline/__init__.py                  |   0
-rw-r--r--  snips_inference_agl/pipeline/configs/__init__.py          |  10
-rw-r--r--  snips_inference_agl/pipeline/configs/config.py            |  49
-rw-r--r--  snips_inference_agl/pipeline/configs/features.py          |  81
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_classifier.py | 307
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_parser.py     | 127
-rw-r--r--  snips_inference_agl/pipeline/configs/nlu_engine.py        |  55
-rw-r--r--  snips_inference_agl/pipeline/configs/slot_filler.py       | 145
-rw-r--r--  snips_inference_agl/pipeline/processing_unit.py           | 177
9 files changed, 951 insertions, 0 deletions
diff --git a/snips_inference_agl/pipeline/__init__.py b/snips_inference_agl/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/snips_inference_agl/pipeline/__init__.py
diff --git a/snips_inference_agl/pipeline/configs/__init__.py b/snips_inference_agl/pipeline/configs/__init__.py
new file mode 100644
index 0000000..027f286
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/__init__.py
@@ -0,0 +1,10 @@
+from .config import Config, ProcessingUnitConfig
+from .features import default_features_factories
+from .intent_classifier import (CooccurrenceVectorizerConfig, FeaturizerConfig,
+                                IntentClassifierDataAugmentationConfig,
+                                LogRegIntentClassifierConfig)
+from .intent_parser import (DeterministicIntentParserConfig,
+                            LookupIntentParserConfig,
+                            ProbabilisticIntentParserConfig)
+from .nlu_engine import NLUEngineConfig
+from .slot_filler import CRFSlotFillerConfig, SlotFillerDataAugmentationConfig
diff --git a/snips_inference_agl/pipeline/configs/config.py b/snips_inference_agl/pipeline/configs/config.py
new file mode 100644
index 0000000..4267fa2
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/config.py
@@ -0,0 +1,49 @@
+from __future__ import unicode_literals
+
+from abc import ABCMeta, abstractmethod, abstractproperty
+from builtins import object
+
+from future.utils import with_metaclass
+
+
+class Config(with_metaclass(ABCMeta, object)):
+    @abstractmethod
+    def to_dict(self):
+        pass
+
+    @classmethod
+    def from_dict(cls, obj_dict):
+        raise NotImplementedError
+
+
+class ProcessingUnitConfig(with_metaclass(ABCMeta, Config)):
+    """Represents the configuration object needed to initialize a
+    :class:`.ProcessingUnit`"""
+
+    @abstractproperty
+    def unit_name(self):
+        raise NotImplementedError
+
+    def set_unit_name(self, value):
+        pass
+
+    def get_required_resources(self):
+        return None
+
+
+class DefaultProcessingUnitConfig(dict, ProcessingUnitConfig):
+    """Default config implemented as a simple dict"""
+
+    @property
+    def unit_name(self):
+        return self["unit_name"]
+
+    def set_unit_name(self, value):
+        self["unit_name"] = value
+
+    def to_dict(self):
+        return self
+
+    @classmethod
+    def from_dict(cls, obj_dict):
+        return cls(obj_dict)
diff --git a/snips_inference_agl/pipeline/configs/features.py b/snips_inference_agl/pipeline/configs/features.py
new file mode 100644
index 0000000..fa12e1a
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/features.py
@@ -0,0 +1,81 @@
+def default_features_factories():
+    """These are the default features used by the :class:`.CRFSlotFiller`
+    objects"""
+
+    from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+    from snips_inference_agl.slot_filler.feature_factory import (
+        NgramFactory, IsDigitFactory, IsFirstFactory, IsLastFactory,
+        ShapeNgramFactory, CustomEntityMatchFactory, BuiltinEntityMatchFactory)
+
+    return [
+        {
+            "args": {
+                "common_words_gazetteer_name": None,
+                "use_stemming": False,
+                "n": 1
+            },
+            "factory_name": NgramFactory.name,
+            "offsets": [-2, -1, 0, 1, 2]
+        },
+        {
+            "args": {
+                "common_words_gazetteer_name": None,
+                "use_stemming": False,
+                "n": 2
+            },
+            "factory_name": NgramFactory.name,
+            "offsets": [-2, 1]
+        },
+        {
+            "args": {},
+            "factory_name": IsDigitFactory.name,
+            "offsets": [-1, 0, 1]
+        },
+        {
+            "args": {},
+            "factory_name": IsFirstFactory.name,
+            "offsets": [-2, -1, 0]
+        },
+        {
+            "args": {},
+            "factory_name": IsLastFactory.name,
+            "offsets": [0, 1, 2]
+        },
+        {
+            "args": {
+                "n": 1
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [0]
+        },
+        {
+            "args": {
+                "n": 2
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [-1, 0]
+        },
+        {
+            "args": {
+                "n": 3
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [-1]
+        },
+        {
+            "args": {
+                "use_stemming": False,
+                "tagging_scheme_code": TaggingScheme.BILOU.value,
+            },
+            "factory_name": CustomEntityMatchFactory.name,
+            "offsets": [-2, -1, 0],
+            "drop_out": 0.5
+        },
+        {
+            "args": {
+                "tagging_scheme_code": TaggingScheme.BIO.value,
+            },
+            "factory_name": BuiltinEntityMatchFactory.name,
+            "offsets": [-2, -1, 0]
+        },
+    ]
+ """ + if data_augmentation_config is None: + data_augmentation_config = IntentClassifierDataAugmentationConfig() + if featurizer_config is None: + featurizer_config = FeaturizerConfig() + self._data_augmentation_config = None + self.data_augmentation_config = data_augmentation_config + self._featurizer_config = None + self.featurizer_config = featurizer_config + self.noise_reweight_factor = noise_reweight_factor + + # pylint: enable=line-too-long + + @property + def data_augmentation_config(self): + return self._data_augmentation_config + + @data_augmentation_config.setter + def data_augmentation_config(self, value): + if isinstance(value, dict): + self._data_augmentation_config = \ + IntentClassifierDataAugmentationConfig.from_dict(value) + elif isinstance(value, IntentClassifierDataAugmentationConfig): + self._data_augmentation_config = value + else: + raise TypeError("Expected instance of " + "IntentClassifierDataAugmentationConfig or dict" + "but received: %s" % type(value)) + + @property + def featurizer_config(self): + return self._featurizer_config + + @featurizer_config.setter + def featurizer_config(self, value): + if isinstance(value, dict): + self._featurizer_config = \ + FeaturizerConfig.from_dict(value) + elif isinstance(value, FeaturizerConfig): + self._featurizer_config = value + else: + raise TypeError("Expected instance of FeaturizerConfig or dict" + "but received: %s" % type(value)) + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import LogRegIntentClassifier + return LogRegIntentClassifier.unit_name + + def get_required_resources(self): + resources = self.data_augmentation_config.get_required_resources() + resources = merge_required_resources( + resources, self.featurizer_config.get_required_resources()) + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "data_augmentation_config": + self.data_augmentation_config.to_dict(), + "featurizer_config": self.featurizer_config.to_dict(), + "noise_reweight_factor": self.noise_reweight_factor, + } + + +class IntentClassifierDataAugmentationConfig(FromDict, Config): + """Configuration used by a :class:`.LogRegIntentClassifier` which defines + how to augment data to improve the training of the classifier""" + + def __init__(self, min_utterances=20, noise_factor=5, + add_builtin_entities_examples=True, unknown_word_prob=0, + unknown_words_replacement_string=None, + max_unknown_words=None): + """ + Args: + min_utterances (int, optional): The minimum number of utterances to + automatically generate for each intent, based on the existing + utterances. Default is 20. + noise_factor (int, optional): Defines the size of the noise to + generate to train the implicit *None* intent, as a multiplier + of the average size of the other intents. Default is 5. + add_builtin_entities_examples (bool, optional): If True, some + builtin entity examples will be automatically added to the + training data. Default is True. 
+ """ + self.min_utterances = min_utterances + self.noise_factor = noise_factor + self.add_builtin_entities_examples = add_builtin_entities_examples + self.unknown_word_prob = unknown_word_prob + self.unknown_words_replacement_string = \ + unknown_words_replacement_string + if max_unknown_words is not None and max_unknown_words < 0: + raise ValueError("max_unknown_words must be None or >= 0") + self.max_unknown_words = max_unknown_words + if unknown_word_prob > 0 and unknown_words_replacement_string is None: + raise ValueError("unknown_word_prob is positive (%s) but the " + "replacement string is None" % unknown_word_prob) + + @staticmethod + def get_required_resources(): + return { + NOISE: True, + STOP_WORDS: True + } + + def to_dict(self): + return { + "min_utterances": self.min_utterances, + "noise_factor": self.noise_factor, + "add_builtin_entities_examples": + self.add_builtin_entities_examples, + "unknown_word_prob": self.unknown_word_prob, + "unknown_words_replacement_string": + self.unknown_words_replacement_string, + "max_unknown_words": self.max_unknown_words + } + + +class FeaturizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.Featurizer` object""" + + # pylint: disable=line-too-long + def __init__(self, tfidf_vectorizer_config=None, + cooccurrence_vectorizer_config=None, + pvalue_threshold=0.4, + added_cooccurrence_feature_ratio=0): + """ + Args: + tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional): + empty configuration of the featurizer's + :attr:`tfidf_vectorizer` + cooccurrence_vectorizer_config: (:class:`.CooccurrenceVectorizerConfig`, optional): + configuration of the featurizer's + :attr:`cooccurrence_vectorizer` + pvalue_threshold (float): after fitting the training set to + extract tfidf features, a univariate feature selection is + applied. Features are tested for independence using a Chi-2 + test, under the null hypothesis that each feature should be + equally present in each class. Only features having a p-value + lower than the threshold are kept + added_cooccurrence_feature_ratio (float, optional): proportion of + cooccurrence features to add with respect to the number of + tfidf features. 
For instance with a ratio of 0.5, if 100 tfidf + features are remaining after feature selection, a maximum of 50 + cooccurrence features will be added + """ + self.pvalue_threshold = pvalue_threshold + self.added_cooccurrence_feature_ratio = \ + added_cooccurrence_feature_ratio + + if tfidf_vectorizer_config is None: + tfidf_vectorizer_config = TfidfVectorizerConfig() + elif isinstance(tfidf_vectorizer_config, dict): + tfidf_vectorizer_config = TfidfVectorizerConfig.from_dict( + tfidf_vectorizer_config) + self.tfidf_vectorizer_config = tfidf_vectorizer_config + + if cooccurrence_vectorizer_config is None: + cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig() + elif isinstance(cooccurrence_vectorizer_config, dict): + cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig \ + .from_dict(cooccurrence_vectorizer_config) + self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config + + # pylint: enable=line-too-long + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import Featurizer + return Featurizer.unit_name + + def get_required_resources(self): + required_resources = self.tfidf_vectorizer_config \ + .get_required_resources() + if self.cooccurrence_vectorizer_config: + required_resources = merge_required_resources( + required_resources, + self.cooccurrence_vectorizer_config.get_required_resources()) + return required_resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "pvalue_threshold": self.pvalue_threshold, + "added_cooccurrence_feature_ratio": + self.added_cooccurrence_feature_ratio, + "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(), + "cooccurrence_vectorizer_config": + self.cooccurrence_vectorizer_config.to_dict(), + } + + +class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.TfidfVectorizerConfig` object""" + + def __init__(self, word_clusters_name=None, use_stemming=False): + """ + Args: + word_clusters_name (str, optional): if a word cluster name is + provided then the featurizer will use the word clusters IDs + detected in the utterances and add them to the utterance text + before computing the tfidf. Default to None + use_stemming (bool, optional): use stemming before computing the + tfdif. Defaults to False (no stemming used) + """ + self.word_clusters_name = word_clusters_name + self.use_stemming = use_stemming + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import TfidfVectorizer + return TfidfVectorizer.unit_name + + def get_required_resources(self): + resources = {STEMS: True if self.use_stemming else False} + if self.word_clusters_name: + resources[WORD_CLUSTERS] = {self.word_clusters_name} + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "word_clusters_name": self.word_clusters_name, + "use_stemming": self.use_stemming + } + + +class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.CooccurrenceVectorizer` object""" + + def __init__(self, window_size=None, unknown_words_replacement_string=None, + filter_stop_words=True, keep_order=True): + """ + Args: + window_size (int, optional): if provided, word cooccurrences will + be taken into account only in a context window of size + :attr:`window_size`. If the window size is 3 then given a word + w[i], the vectorizer will only extract the following pairs: + (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]). 
+ Defaults to None, which means that we consider all words + unknown_words_replacement_string (str, optional) + filter_stop_words (bool, optional): if True, stop words are ignored + when computing cooccurrences + keep_order (bool, optional): if True then cooccurrence are computed + taking the words order into account, which means the pairs + (w1, w2) and (w2, w1) will count as two separate features. + Defaults to `True`. + """ + self.window_size = window_size + self.unknown_words_replacement_string = \ + unknown_words_replacement_string + self.filter_stop_words = filter_stop_words + self.keep_order = keep_order + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import CooccurrenceVectorizer + return CooccurrenceVectorizer.unit_name + + def get_required_resources(self): + return { + STOP_WORDS: self.filter_stop_words, + # We require the parser to be trained without stems because we + # don't normalize and stem when processing in the + # CooccurrenceVectorizer (in order to run the builtin and + # custom parser on the same unormalized input). + # Requiring no stems ensures we'll be able to parse the unstemmed + # input + CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS + } + + def to_dict(self): + return { + "unit_name": self.unit_name, + "unknown_words_replacement_string": + self.unknown_words_replacement_string, + "window_size": self.window_size, + "filter_stop_words": self.filter_stop_words, + "keep_order": self.keep_order + } diff --git a/snips_inference_agl/pipeline/configs/intent_parser.py b/snips_inference_agl/pipeline/configs/intent_parser.py new file mode 100644 index 0000000..f017472 --- /dev/null +++ b/snips_inference_agl/pipeline/configs/intent_parser.py @@ -0,0 +1,127 @@ +from __future__ import unicode_literals + +from snips_inference_agl.common.from_dict import FromDict +from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS +from snips_inference_agl.entity_parser import CustomEntityParserUsage +from snips_inference_agl.pipeline.configs import ProcessingUnitConfig +from snips_inference_agl.resources import merge_required_resources + + +class ProbabilisticIntentParserConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.ProbabilisticIntentParser` object + + Args: + intent_classifier_config (:class:`.ProcessingUnitConfig`): The + configuration of the underlying intent classifier, by default + it uses a :class:`.LogRegIntentClassifierConfig` + slot_filler_config (:class:`.ProcessingUnitConfig`): The configuration + that will be used for the underlying slot fillers, by default it + uses a :class:`.CRFSlotFillerConfig` + """ + + def __init__(self, intent_classifier_config=None, slot_filler_config=None): + from snips_inference_agl.intent_classifier import IntentClassifier + from snips_inference_agl.slot_filler import SlotFiller + + if intent_classifier_config is None: + from snips_inference_agl.pipeline.configs import LogRegIntentClassifierConfig + intent_classifier_config = LogRegIntentClassifierConfig() + if slot_filler_config is None: + from snips_inference_agl.pipeline.configs import CRFSlotFillerConfig + slot_filler_config = CRFSlotFillerConfig() + self.intent_classifier_config = IntentClassifier.get_config( + intent_classifier_config) + self.slot_filler_config = SlotFiller.get_config(slot_filler_config) + + @property + def unit_name(self): + from snips_inference_agl.intent_parser import ProbabilisticIntentParser + return ProbabilisticIntentParser.unit_name + + def 
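
These config classes are plain value objects with symmetric to_dict()/from_dict() conversion (via the FromDict mixin), which is how they are serialized into a persisted engine. A minimal round-trip sketch, assuming the snips_inference_agl package is importable:

    from snips_inference_agl.pipeline.configs import (
        FeaturizerConfig, LogRegIntentClassifierConfig)

    # Nested configs may be passed as objects or plain dicts; the property
    # setters normalize both forms.
    config = LogRegIntentClassifierConfig(
        featurizer_config=FeaturizerConfig(pvalue_threshold=0.3),
        noise_reweight_factor=2.0)

    # to_dict()/from_dict() round-trip, as used when persisting an engine
    restored = LogRegIntentClassifierConfig.from_dict(config.to_dict())
    assert restored.to_dict() == config.to_dict()
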
diff --git a/snips_inference_agl/pipeline/configs/intent_parser.py b/snips_inference_agl/pipeline/configs/intent_parser.py
new file mode 100644
index 0000000..f017472
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_parser.py
@@ -0,0 +1,127 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS
+from snips_inference_agl.entity_parser import CustomEntityParserUsage
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class ProbabilisticIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.ProbabilisticIntentParser` object
+
+    Args:
+        intent_classifier_config (:class:`.ProcessingUnitConfig`): The
+            configuration of the underlying intent classifier, by default
+            it uses a :class:`.LogRegIntentClassifierConfig`
+        slot_filler_config (:class:`.ProcessingUnitConfig`): The configuration
+            that will be used for the underlying slot fillers, by default it
+            uses a :class:`.CRFSlotFillerConfig`
+    """
+
+    def __init__(self, intent_classifier_config=None, slot_filler_config=None):
+        from snips_inference_agl.intent_classifier import IntentClassifier
+        from snips_inference_agl.slot_filler import SlotFiller
+
+        if intent_classifier_config is None:
+            from snips_inference_agl.pipeline.configs import LogRegIntentClassifierConfig
+            intent_classifier_config = LogRegIntentClassifierConfig()
+        if slot_filler_config is None:
+            from snips_inference_agl.pipeline.configs import CRFSlotFillerConfig
+            slot_filler_config = CRFSlotFillerConfig()
+        self.intent_classifier_config = IntentClassifier.get_config(
+            intent_classifier_config)
+        self.slot_filler_config = SlotFiller.get_config(slot_filler_config)
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser import ProbabilisticIntentParser
+        return ProbabilisticIntentParser.unit_name
+
+    def get_required_resources(self):
+        resources = self.intent_classifier_config.get_required_resources()
+        resources = merge_required_resources(
+            resources, self.slot_filler_config.get_required_resources())
+        return resources
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "slot_filler_config": self.slot_filler_config.to_dict(),
+            "intent_classifier_config": self.intent_classifier_config.to_dict()
+        }
+
+
+class DeterministicIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.DeterministicIntentParser`
+
+    Args:
+        max_queries (int, optional): Maximum number of regex patterns per
+            intent. 100 by default.
+        max_pattern_length (int, optional): Maximum length of regex patterns.
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building patterns.
+
+    These limits make it possible to deactivate regular expressions when they
+    grow too big, avoiding an explosion in time and memory
+
+    Note:
+        In the future, a FST will be used instead of regexps, removing the need
+        for all this
+    """
+
+    def __init__(self, max_queries=100, max_pattern_length=1000,
+                 ignore_stop_words=False):
+        self.max_queries = max_queries
+        self.max_pattern_length = max_pattern_length
+        self.ignore_stop_words = ignore_stop_words
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser import DeterministicIntentParser
+        return DeterministicIntentParser.unit_name
+
+    def get_required_resources(self):
+        return {
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
+        }
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "max_queries": self.max_queries,
+            "max_pattern_length": self.max_pattern_length,
+            "ignore_stop_words": self.ignore_stop_words
+        }
+
+
+class LookupIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.LookupIntentParser`
+
+    Args:
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building patterns.
+    """
+
+    def __init__(self, ignore_stop_words=False):
+        self.ignore_stop_words = ignore_stop_words
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser.lookup_intent_parser import \
+            LookupIntentParser
+        return LookupIntentParser.unit_name
+
+    def get_required_resources(self):
+        return {
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
+        }
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "ignore_stop_words": self.ignore_stop_words
+        }
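
A short sketch of how these parser configs compose, assuming the package is importable: the probabilistic parser aggregates the resources required by its nested classifier and slot filler configs, which is what get_required_resources() above computes.

    from snips_inference_agl.pipeline.configs import (
        DeterministicIntentParserConfig, ProbabilisticIntentParserConfig)

    # A deterministic parser that strips stop words before building patterns
    det_config = DeterministicIntentParserConfig(max_queries=200,
                                                 ignore_stop_words=True)

    # Defaults to a LogReg classifier config plus a CRF slot filler config;
    # required resources are merged from both nested configs
    prob_config = ProbabilisticIntentParserConfig()
    print(prob_config.get_required_resources())
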
+ """ + + def __init__(self, intent_parsers_configs=None, random_seed=None): + from snips_inference_agl.intent_parser import IntentParser + + if intent_parsers_configs is None: + from snips_inference_agl.pipeline.configs import ( + ProbabilisticIntentParserConfig, + DeterministicIntentParserConfig) + intent_parsers_configs = [ + DeterministicIntentParserConfig(), + ProbabilisticIntentParserConfig() + ] + self.intent_parsers_configs = [ + IntentParser.get_config(conf) for conf in intent_parsers_configs] + self.random_seed = random_seed + + @property + def unit_name(self): + from snips_inference_agl.nlu_engine.nlu_engine import SnipsNLUEngine + return SnipsNLUEngine.unit_name + + def get_required_resources(self): + # Resolving custom slot values must be done without stemming + resources = { + CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS + } + for config in self.intent_parsers_configs: + resources = merge_required_resources( + resources, config.get_required_resources()) + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "intent_parsers_configs": [ + config.to_dict() for config in self.intent_parsers_configs + ] + } diff --git a/snips_inference_agl/pipeline/configs/slot_filler.py b/snips_inference_agl/pipeline/configs/slot_filler.py new file mode 100644 index 0000000..be36e9c --- /dev/null +++ b/snips_inference_agl/pipeline/configs/slot_filler.py @@ -0,0 +1,145 @@ +from __future__ import unicode_literals + +from snips_inference_agl.common.from_dict import FromDict +from snips_inference_agl.constants import STOP_WORDS +from snips_inference_agl.pipeline.configs import ( + Config, ProcessingUnitConfig, default_features_factories) +from snips_inference_agl.resources import merge_required_resources + + +class CRFSlotFillerConfig(FromDict, ProcessingUnitConfig): + # pylint: disable=line-too-long + """Configuration of a :class:`.CRFSlotFiller` + + Args: + feature_factory_configs (list, optional): List of configurations that + specify the list of :class:`.CRFFeatureFactory` to use with the CRF + tagging_scheme (:class:`.TaggingScheme`, optional): Tagging scheme to + use to enrich CRF labels (default=BIO) + crf_args (dict, optional): Allow to overwrite the parameters of the CRF + defined in *sklearn_crfsuite*, see :class:`sklearn_crfsuite.CRF` + (default={"c1": .1, "c2": .1, "algorithm": "lbfgs"}) + data_augmentation_config (dict or :class:`.SlotFillerDataAugmentationConfig`, optional): + Specify how to augment data before training the CRF, see the + corresponding config object for more details. 
diff --git a/snips_inference_agl/pipeline/configs/slot_filler.py b/snips_inference_agl/pipeline/configs/slot_filler.py
new file mode 100644
index 0000000..be36e9c
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/slot_filler.py
@@ -0,0 +1,145 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import STOP_WORDS
+from snips_inference_agl.pipeline.configs import (
+    Config, ProcessingUnitConfig, default_features_factories)
+from snips_inference_agl.resources import merge_required_resources
+
+
+class CRFSlotFillerConfig(FromDict, ProcessingUnitConfig):
+    # pylint: disable=line-too-long
+    """Configuration of a :class:`.CRFSlotFiller`
+
+    Args:
+        feature_factory_configs (list, optional): List of configurations that
+            specify the list of :class:`.CRFFeatureFactory` to use with the CRF
+        tagging_scheme (:class:`.TaggingScheme`, optional): Tagging scheme to
+            use to enrich CRF labels (default=BIO)
+        crf_args (dict, optional): Allows overwriting the parameters of the CRF
+            defined in *sklearn_crfsuite*, see :class:`sklearn_crfsuite.CRF`
+            (default={"c1": .1, "c2": .1, "algorithm": "lbfgs"})
+        data_augmentation_config (dict or :class:`.SlotFillerDataAugmentationConfig`, optional):
+            Specify how to augment data before training the CRF, see the
+            corresponding config object for more details.
+    """
+
+    # pylint: enable=line-too-long
+
+    def __init__(self, feature_factory_configs=None,
+                 tagging_scheme=None, crf_args=None,
+                 data_augmentation_config=None):
+        if tagging_scheme is None:
+            from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+            tagging_scheme = TaggingScheme.BIO
+        if feature_factory_configs is None:
+            feature_factory_configs = default_features_factories()
+        if crf_args is None:
+            crf_args = _default_crf_args()
+        if data_augmentation_config is None:
+            data_augmentation_config = SlotFillerDataAugmentationConfig()
+        self.feature_factory_configs = feature_factory_configs
+        self._tagging_scheme = None
+        self.tagging_scheme = tagging_scheme
+        self.crf_args = crf_args
+        self._data_augmentation_config = None
+        self.data_augmentation_config = data_augmentation_config
+
+    @property
+    def tagging_scheme(self):
+        return self._tagging_scheme
+
+    @tagging_scheme.setter
+    def tagging_scheme(self, value):
+        from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+        if isinstance(value, TaggingScheme):
+            self._tagging_scheme = value
+        elif isinstance(value, int):
+            self._tagging_scheme = TaggingScheme(value)
+        else:
+            raise TypeError("Expected instance of TaggingScheme or int but "
+                            "received: %s" % type(value))
+
+    @property
+    def data_augmentation_config(self):
+        return self._data_augmentation_config
+
+    @data_augmentation_config.setter
+    def data_augmentation_config(self, value):
+        if isinstance(value, dict):
+            self._data_augmentation_config = \
+                SlotFillerDataAugmentationConfig.from_dict(value)
+        elif isinstance(value, SlotFillerDataAugmentationConfig):
+            self._data_augmentation_config = value
+        else:
+            raise TypeError("Expected instance of "
+                            "SlotFillerDataAugmentationConfig or dict but "
+                            "received: %s" % type(value))
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.slot_filler import CRFSlotFiller
+        return CRFSlotFiller.unit_name
+
+    def get_required_resources(self):
+        # Import here to avoid circular imports
+        from snips_inference_agl.slot_filler.feature_factory import CRFFeatureFactory
+
+        resources = self.data_augmentation_config.get_required_resources()
+        for config in self.feature_factory_configs:
+            factory = CRFFeatureFactory.from_config(config)
+            resources = merge_required_resources(
+                resources, factory.get_required_resources())
+        return resources
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "feature_factory_configs": self.feature_factory_configs,
+            "crf_args": self.crf_args,
+            "tagging_scheme": self.tagging_scheme.value,
+            "data_augmentation_config":
+                self.data_augmentation_config.to_dict()
+        }
+
+
+class SlotFillerDataAugmentationConfig(FromDict, Config):
+    """Specify how to augment data before training the CRF
+
+    Data augmentation essentially consists in creating additional utterances
+    by combining utterance patterns and slot values
+
+    Args:
+        min_utterances (int, optional): Specify the minimum amount of
+            utterances to generate per intent (default=200)
+        capitalization_ratio (float, optional): If an entity has one or more
+            capitalized values, the data augmentation will randomly capitalize
+            its values with a ratio of *capitalization_ratio* (default=.2)
+        add_builtin_entities_examples (bool, optional): If True, some builtin
+            entity examples will be automatically added to the training data.
+            Default is True.
+    """
+
+    def __init__(self, min_utterances=200, capitalization_ratio=.2,
+                 add_builtin_entities_examples=True):
+        self.min_utterances = min_utterances
+        self.capitalization_ratio = capitalization_ratio
+        self.add_builtin_entities_examples = add_builtin_entities_examples
+
+    def get_required_resources(self):
+        return {
+            STOP_WORDS: True
+        }
+
+    def to_dict(self):
+        return {
+            "min_utterances": self.min_utterances,
+            "capitalization_ratio": self.capitalization_ratio,
+            "add_builtin_entities_examples": self.add_builtin_entities_examples
+        }
+
+
+def _default_crf_args():
+    return {"c1": .1, "c2": .1, "algorithm": "lbfgs"}
+ """ + + def __init__(self, config, **shared): + if config is None: + self.config = self.default_config() + elif isinstance(config, ProcessingUnitConfig): + self.config = config + elif isinstance(config, dict): + self.config = self.config_type.from_dict(config) + else: + raise ValueError("Unexpected config type: %s" % type(config)) + if self.config is not None: + self.config.set_unit_name(self.unit_name) + self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER) + self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER) + self.resources = shared.get(RESOURCES) + self.random_state = check_random_state(shared.get(RANDOM_STATE)) + + @classproperty + def config_type(cls): # pylint:disable=no-self-argument + return DefaultProcessingUnitConfig + + @classmethod + def default_config(cls): + config = cls.config_type() # pylint:disable=no-value-for-parameter + config.set_unit_name(cls.unit_name) + return config + + @classproperty + def unit_name(cls): # pylint:disable=no-self-argument + return ProcessingUnit.registered_name(cls) + + @classmethod + def from_config(cls, unit_config, **shared): + """Build a :class:`ProcessingUnit` from the provided config""" + unit = cls.by_name(unit_config.unit_name) + return unit(unit_config, **shared) + + @classmethod + def load_from_path(cls, unit_path, unit_name=None, **shared): + """Load a :class:`ProcessingUnit` from a persisted processing unit + directory + + Args: + unit_path (str or :class:`pathlib.Path`): path to the persisted + processing unit + unit_name (str, optional): Name of the processing unit to load. + By default, the unit name is assumed to be stored in a + "metadata.json" file located in the directory at unit_path. + + Raises: + LoadingError: when unit_name is None and no metadata file is found + in the processing unit directory + """ + unit_path = Path(unit_path) + if unit_name is None: + metadata_path = unit_path / "metadata.json" + if not metadata_path.exists(): + raise LoadingError( + "Missing metadata for processing unit at path %s" + % str(unit_path)) + with metadata_path.open(encoding="utf8") as f: + metadata = json.load(f) + unit_name = metadata["unit_name"] + unit = cls.by_name(unit_name) + return unit.from_path(unit_path, **shared) + + @classmethod + def get_config(cls, unit_config): + """Returns the :class:`.ProcessingUnitConfig` corresponding to + *unit_config*""" + if isinstance(unit_config, ProcessingUnitConfig): + return unit_config + elif isinstance(unit_config, dict): + unit_name = unit_config["unit_name"] + processing_unit_type = cls.by_name(unit_name) + return processing_unit_type.config_type.from_dict(unit_config) + elif isinstance(unit_config, (str, bytes)): + unit_name = unit_config + unit_config = {"unit_name": unit_name} + processing_unit_type = cls.by_name(unit_name) + return processing_unit_type.config_type.from_dict(unit_config) + else: + raise ValueError( + "Expected `unit_config` to be an instance of " + "ProcessingUnitConfig or dict or str but found: %s" + % type(unit_config)) + + @abstractproperty + def fitted(self): + """Whether or not the processing unit has already been trained""" + pass + + def load_resources_if_needed(self, language): + if self.resources is None or self.fitted: + required_resources = None + if self.config is not None: + required_resources = self.config.get_required_resources() + self.resources = load_resources(language, required_resources) + + def fit_builtin_entity_parser_if_needed(self, dataset): + # We only fit a builtin entity parser when the unit has already been + # fitted or if 
the parser is none. + # In the other cases the parser is provided fitted by another unit. + if self.builtin_entity_parser is None or self.fitted: + self.builtin_entity_parser = BuiltinEntityParser.build( + dataset=dataset) + return self + + def fit_custom_entity_parser_if_needed(self, dataset): + # We only fit a custom entity parser when the unit has already been + # fitted or if the parser is none. + # In the other cases the parser is provided fitted by another unit. + required_resources = self.config.get_required_resources() + if not required_resources or not required_resources.get( + CUSTOM_ENTITY_PARSER_USAGE): + # In these cases we need a custom entity parser only to do the + # final slot resolution step, which must be done without stemming. + parser_usage = CustomEntityParserUsage.WITHOUT_STEMS + else: + parser_usage = required_resources[CUSTOM_ENTITY_PARSER_USAGE] + + if self.custom_entity_parser is None or self.fitted: + self.load_resources_if_needed(dataset[LANGUAGE]) + self.custom_entity_parser = CustomEntityParser.build( + dataset, parser_usage, self.resources) + return self + + def persist_metadata(self, path, **kwargs): + metadata = {"unit_name": self.unit_name} + metadata.update(kwargs) + metadata_json = json_string(metadata) + with (path / "metadata.json").open(mode="w", encoding="utf8") as f: + f.write(metadata_json) + + # @abstractmethod + def persist(self, path): + pass + + @abstractclassmethod + def from_path(cls, path, **shared): + pass |
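
The persistence contract above is simple: a persisted unit directory carries a "metadata.json" whose "unit_name" lets load_from_path() resolve the right ProcessingUnit subclass through the registry. The sketch below is illustrative only (plain Python, not the library itself); the directory path and the "crf_slot_filler" unit name are assumptions for illustration.

    import json
    from pathlib import Path

    # Illustrative only: the "metadata.json" contract used by
    # persist_metadata() and load_from_path()
    unit_dir = Path("/tmp/example_unit")  # hypothetical unit directory
    unit_dir.mkdir(parents=True, exist_ok=True)
    (unit_dir / "metadata.json").write_text(
        json.dumps({"unit_name": "crf_slot_filler"}), encoding="utf8")

    # load_from_path() reads this file to pick the ProcessingUnit subclass
    metadata = json.loads(
        (unit_dir / "metadata.json").read_text(encoding="utf8"))
    print(metadata["unit_name"])
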