Diffstat (limited to 'snips_inference_agl/pipeline')
-rw-r--r--  snips_inference_agl/pipeline/__init__.py                     0
-rw-r--r--  snips_inference_agl/pipeline/configs/__init__.py            10
-rw-r--r--  snips_inference_agl/pipeline/configs/config.py              49
-rw-r--r--  snips_inference_agl/pipeline/configs/features.py            81
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_classifier.py  307
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_parser.py      127
-rw-r--r--  snips_inference_agl/pipeline/configs/nlu_engine.py          57
-rw-r--r--  snips_inference_agl/pipeline/configs/slot_filler.py        143
-rw-r--r--  snips_inference_agl/pipeline/processing_unit.py            177
9 files changed, 951 insertions, 0 deletions
diff --git a/snips_inference_agl/pipeline/__init__.py b/snips_inference_agl/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/snips_inference_agl/pipeline/__init__.py
diff --git a/snips_inference_agl/pipeline/configs/__init__.py b/snips_inference_agl/pipeline/configs/__init__.py
new file mode 100644
index 0000000..027f286
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/__init__.py
@@ -0,0 +1,10 @@
+from .config import Config, ProcessingUnitConfig
+from .features import default_features_factories
+from .intent_classifier import (CooccurrenceVectorizerConfig, FeaturizerConfig,
+ IntentClassifierDataAugmentationConfig,
+ LogRegIntentClassifierConfig)
+from .intent_parser import (DeterministicIntentParserConfig,
+ LookupIntentParserConfig,
+ ProbabilisticIntentParserConfig)
+from .nlu_engine import NLUEngineConfig
+from .slot_filler import CRFSlotFillerConfig, SlotFillerDataAugmentationConfig
diff --git a/snips_inference_agl/pipeline/configs/config.py b/snips_inference_agl/pipeline/configs/config.py
new file mode 100644
index 0000000..4267fa2
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/config.py
@@ -0,0 +1,49 @@
+from __future__ import unicode_literals
+
+from abc import ABCMeta, abstractmethod, abstractproperty
+from builtins import object
+
+from future.utils import with_metaclass
+
+
+class Config(with_metaclass(ABCMeta, object)):
+ @abstractmethod
+ def to_dict(self):
+ pass
+
+ @classmethod
+ def from_dict(cls, obj_dict):
+ raise NotImplementedError
+
+
+class ProcessingUnitConfig(with_metaclass(ABCMeta, Config)):
+ """Represents the configuration object needed to initialize a
+ :class:`.ProcessingUnit`"""
+
+ @abstractproperty
+ def unit_name(self):
+ raise NotImplementedError
+
+ def set_unit_name(self, value):
+ pass
+
+ def get_required_resources(self):
+ return None
+
+
+class DefaultProcessingUnitConfig(dict, ProcessingUnitConfig):
+ """Default config implemented as a simple dict"""
+
+ @property
+ def unit_name(self):
+ return self["unit_name"]
+
+ def set_unit_name(self, value):
+ self["unit_name"] = value
+
+ def to_dict(self):
+ return self
+
+ @classmethod
+ def from_dict(cls, obj_dict):
+ return cls(obj_dict)
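
Usage sketch for the file above — a minimal example of how the dict-backed default config behaves, assuming the snips_inference_agl package is importable; the "my_unit" name is purely hypothetical.

    from snips_inference_agl.pipeline.configs.config import (
        DefaultProcessingUnitConfig)

    # DefaultProcessingUnitConfig subclasses dict, so to_dict() returns the
    # instance itself and set_unit_name() mutates the dict in place.
    config = DefaultProcessingUnitConfig({"unit_name": "my_unit"})
    assert config.to_dict() is config
    assert config.unit_name == "my_unit"
    config.set_unit_name("my_unit_v2")
    assert config["unit_name"] == "my_unit_v2"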
diff --git a/snips_inference_agl/pipeline/configs/features.py b/snips_inference_agl/pipeline/configs/features.py
new file mode 100644
index 0000000..fa12e1a
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/features.py
@@ -0,0 +1,81 @@
+def default_features_factories():
+ """These are the default features used by the :class:`.CRFSlotFiller`
+ objects"""
+
+ from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+ from snips_inference_agl.slot_filler.feature_factory import (
+ NgramFactory, IsDigitFactory, IsFirstFactory, IsLastFactory,
+ ShapeNgramFactory, CustomEntityMatchFactory, BuiltinEntityMatchFactory)
+
+ return [
+ {
+ "args": {
+ "common_words_gazetteer_name": None,
+ "use_stemming": False,
+ "n": 1
+ },
+ "factory_name": NgramFactory.name,
+ "offsets": [-2, -1, 0, 1, 2]
+ },
+ {
+ "args": {
+ "common_words_gazetteer_name": None,
+ "use_stemming": False,
+ "n": 2
+ },
+ "factory_name": NgramFactory.name,
+ "offsets": [-2, 1]
+ },
+ {
+ "args": {},
+ "factory_name": IsDigitFactory.name,
+ "offsets": [-1, 0, 1]
+ },
+ {
+ "args": {},
+ "factory_name": IsFirstFactory.name,
+ "offsets": [-2, -1, 0]
+ },
+ {
+ "args": {},
+ "factory_name": IsLastFactory.name,
+ "offsets": [0, 1, 2]
+ },
+ {
+ "args": {
+ "n": 1
+ },
+ "factory_name": ShapeNgramFactory.name,
+ "offsets": [0]
+ },
+ {
+ "args": {
+ "n": 2
+ },
+ "factory_name": ShapeNgramFactory.name,
+ "offsets": [-1, 0]
+ },
+ {
+ "args": {
+ "n": 3
+ },
+ "factory_name": ShapeNgramFactory.name,
+ "offsets": [-1]
+ },
+ {
+ "args": {
+ "use_stemming": False,
+ "tagging_scheme_code": TaggingScheme.BILOU.value,
+ },
+ "factory_name": CustomEntityMatchFactory.name,
+ "offsets": [-2, -1, 0],
+ "drop_out": 0.5
+ },
+ {
+ "args": {
+ "tagging_scheme_code": TaggingScheme.BIO.value,
+ },
+ "factory_name": BuiltinEntityMatchFactory.name,
+ "offsets": [-2, -1, 0]
+ },
+ ]
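
Usage sketch for the file above — tallying the default factories by name, assuming the feature factory classes referenced in the function are importable. With the defaults shown here, the tally contains two ngram and three shape-ngram configurations, plus one of each remaining factory.

    from collections import Counter

    from snips_inference_agl.pipeline.configs import default_features_factories

    # Count the default CRF feature factory configurations by factory name.
    factories = default_features_factories()
    print(Counter(f["factory_name"] for f in factories))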
diff --git a/snips_inference_agl/pipeline/configs/intent_classifier.py b/snips_inference_agl/pipeline/configs/intent_classifier.py
new file mode 100644
index 0000000..fc22c87
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_classifier.py
@@ -0,0 +1,307 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import (
+ CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
+from snips_inference_agl.entity_parser.custom_entity_parser import (
+ CustomEntityParserUsage)
+from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.LogRegIntentClassifier`"""
+
+ # pylint: disable=line-too-long
+ def __init__(self, data_augmentation_config=None, featurizer_config=None,
+ noise_reweight_factor=1.0):
+ """
+ Args:
+ data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
+ Defines the strategy of the underlying data augmentation
+ featurizer_config (:class:`FeaturizerConfig`): Configuration of the
+ :class:`.Featurizer` used underneath
+            noise_reweight_factor (float, optional): this parameter allows
+                changing the weight of the None class. By default, the class
+                weights are computed using a "balanced" strategy. The
+                noise_reweight_factor allows deviating from this strategy.
+ """
+ if data_augmentation_config is None:
+ data_augmentation_config = IntentClassifierDataAugmentationConfig()
+ if featurizer_config is None:
+ featurizer_config = FeaturizerConfig()
+ self._data_augmentation_config = None
+ self.data_augmentation_config = data_augmentation_config
+ self._featurizer_config = None
+ self.featurizer_config = featurizer_config
+ self.noise_reweight_factor = noise_reweight_factor
+
+ # pylint: enable=line-too-long
+
+ @property
+ def data_augmentation_config(self):
+ return self._data_augmentation_config
+
+ @data_augmentation_config.setter
+ def data_augmentation_config(self, value):
+ if isinstance(value, dict):
+ self._data_augmentation_config = \
+ IntentClassifierDataAugmentationConfig.from_dict(value)
+ elif isinstance(value, IntentClassifierDataAugmentationConfig):
+ self._data_augmentation_config = value
+ else:
+ raise TypeError("Expected instance of "
+ "IntentClassifierDataAugmentationConfig or dict"
+ "but received: %s" % type(value))
+
+ @property
+ def featurizer_config(self):
+ return self._featurizer_config
+
+ @featurizer_config.setter
+ def featurizer_config(self, value):
+ if isinstance(value, dict):
+ self._featurizer_config = \
+ FeaturizerConfig.from_dict(value)
+ elif isinstance(value, FeaturizerConfig):
+ self._featurizer_config = value
+ else:
+ raise TypeError("Expected instance of FeaturizerConfig or dict"
+ "but received: %s" % type(value))
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_classifier import LogRegIntentClassifier
+ return LogRegIntentClassifier.unit_name
+
+ def get_required_resources(self):
+ resources = self.data_augmentation_config.get_required_resources()
+ resources = merge_required_resources(
+ resources, self.featurizer_config.get_required_resources())
+ return resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "data_augmentation_config":
+ self.data_augmentation_config.to_dict(),
+ "featurizer_config": self.featurizer_config.to_dict(),
+ "noise_reweight_factor": self.noise_reweight_factor,
+ }
+
+
+class IntentClassifierDataAugmentationConfig(FromDict, Config):
+ """Configuration used by a :class:`.LogRegIntentClassifier` which defines
+ how to augment data to improve the training of the classifier"""
+
+ def __init__(self, min_utterances=20, noise_factor=5,
+ add_builtin_entities_examples=True, unknown_word_prob=0,
+ unknown_words_replacement_string=None,
+ max_unknown_words=None):
+ """
+ Args:
+ min_utterances (int, optional): The minimum number of utterances to
+ automatically generate for each intent, based on the existing
+ utterances. Default is 20.
+ noise_factor (int, optional): Defines the size of the noise to
+ generate to train the implicit *None* intent, as a multiplier
+ of the average size of the other intents. Default is 5.
+ add_builtin_entities_examples (bool, optional): If True, some
+ builtin entity examples will be automatically added to the
+ training data. Default is True.
+ """
+ self.min_utterances = min_utterances
+ self.noise_factor = noise_factor
+ self.add_builtin_entities_examples = add_builtin_entities_examples
+ self.unknown_word_prob = unknown_word_prob
+ self.unknown_words_replacement_string = \
+ unknown_words_replacement_string
+ if max_unknown_words is not None and max_unknown_words < 0:
+ raise ValueError("max_unknown_words must be None or >= 0")
+ self.max_unknown_words = max_unknown_words
+ if unknown_word_prob > 0 and unknown_words_replacement_string is None:
+ raise ValueError("unknown_word_prob is positive (%s) but the "
+ "replacement string is None" % unknown_word_prob)
+
+ @staticmethod
+ def get_required_resources():
+ return {
+ NOISE: True,
+ STOP_WORDS: True
+ }
+
+ def to_dict(self):
+ return {
+ "min_utterances": self.min_utterances,
+ "noise_factor": self.noise_factor,
+ "add_builtin_entities_examples":
+ self.add_builtin_entities_examples,
+ "unknown_word_prob": self.unknown_word_prob,
+ "unknown_words_replacement_string":
+ self.unknown_words_replacement_string,
+ "max_unknown_words": self.max_unknown_words
+ }
+
+
+class FeaturizerConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.Featurizer` object"""
+
+ # pylint: disable=line-too-long
+ def __init__(self, tfidf_vectorizer_config=None,
+ cooccurrence_vectorizer_config=None,
+ pvalue_threshold=0.4,
+ added_cooccurrence_feature_ratio=0):
+ """
+ Args:
+            tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional):
+                configuration of the featurizer's
+                :attr:`tfidf_vectorizer`
+            cooccurrence_vectorizer_config (:class:`.CooccurrenceVectorizerConfig`, optional):
+ configuration of the featurizer's
+ :attr:`cooccurrence_vectorizer`
+ pvalue_threshold (float): after fitting the training set to
+ extract tfidf features, a univariate feature selection is
+ applied. Features are tested for independence using a Chi-2
+ test, under the null hypothesis that each feature should be
+ equally present in each class. Only features having a p-value
+ lower than the threshold are kept
+ added_cooccurrence_feature_ratio (float, optional): proportion of
+ cooccurrence features to add with respect to the number of
+ tfidf features. For instance with a ratio of 0.5, if 100 tfidf
+ features are remaining after feature selection, a maximum of 50
+ cooccurrence features will be added
+ """
+ self.pvalue_threshold = pvalue_threshold
+ self.added_cooccurrence_feature_ratio = \
+ added_cooccurrence_feature_ratio
+
+ if tfidf_vectorizer_config is None:
+ tfidf_vectorizer_config = TfidfVectorizerConfig()
+ elif isinstance(tfidf_vectorizer_config, dict):
+ tfidf_vectorizer_config = TfidfVectorizerConfig.from_dict(
+ tfidf_vectorizer_config)
+ self.tfidf_vectorizer_config = tfidf_vectorizer_config
+
+ if cooccurrence_vectorizer_config is None:
+ cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
+ elif isinstance(cooccurrence_vectorizer_config, dict):
+ cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig \
+ .from_dict(cooccurrence_vectorizer_config)
+ self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config
+
+ # pylint: enable=line-too-long
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_classifier import Featurizer
+ return Featurizer.unit_name
+
+ def get_required_resources(self):
+ required_resources = self.tfidf_vectorizer_config \
+ .get_required_resources()
+ if self.cooccurrence_vectorizer_config:
+ required_resources = merge_required_resources(
+ required_resources,
+ self.cooccurrence_vectorizer_config.get_required_resources())
+ return required_resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "pvalue_threshold": self.pvalue_threshold,
+ "added_cooccurrence_feature_ratio":
+ self.added_cooccurrence_feature_ratio,
+ "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(),
+ "cooccurrence_vectorizer_config":
+ self.cooccurrence_vectorizer_config.to_dict(),
+ }
+
+
+class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.TfidfVectorizerConfig` object"""
+
+ def __init__(self, word_clusters_name=None, use_stemming=False):
+ """
+ Args:
+ word_clusters_name (str, optional): if a word cluster name is
+                provided then the featurizer will use the word cluster IDs
+                detected in the utterances and add them to the utterance text
+                before computing the tfidf. Defaults to None
+            use_stemming (bool, optional): use stemming before computing the
+                tfidf. Defaults to False (no stemming used)
+ """
+ self.word_clusters_name = word_clusters_name
+ self.use_stemming = use_stemming
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_classifier import TfidfVectorizer
+ return TfidfVectorizer.unit_name
+
+ def get_required_resources(self):
+        resources = {STEMS: self.use_stemming}
+ if self.word_clusters_name:
+ resources[WORD_CLUSTERS] = {self.word_clusters_name}
+ return resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "word_clusters_name": self.word_clusters_name,
+ "use_stemming": self.use_stemming
+ }
+
+
+class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.CooccurrenceVectorizer` object"""
+
+ def __init__(self, window_size=None, unknown_words_replacement_string=None,
+ filter_stop_words=True, keep_order=True):
+ """
+ Args:
+ window_size (int, optional): if provided, word cooccurrences will
+ be taken into account only in a context window of size
+ :attr:`window_size`. If the window size is 3 then given a word
+ w[i], the vectorizer will only extract the following pairs:
+ (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]).
+ Defaults to None, which means that we consider all words
+ unknown_words_replacement_string (str, optional)
+ filter_stop_words (bool, optional): if True, stop words are ignored
+ when computing cooccurrences
+            keep_order (bool, optional): if True then cooccurrences are computed
+ taking the words order into account, which means the pairs
+ (w1, w2) and (w2, w1) will count as two separate features.
+ Defaults to `True`.
+ """
+ self.window_size = window_size
+ self.unknown_words_replacement_string = \
+ unknown_words_replacement_string
+ self.filter_stop_words = filter_stop_words
+ self.keep_order = keep_order
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
+ return CooccurrenceVectorizer.unit_name
+
+ def get_required_resources(self):
+ return {
+ STOP_WORDS: self.filter_stop_words,
+ # We require the parser to be trained without stems because we
+ # don't normalize and stem when processing in the
+ # CooccurrenceVectorizer (in order to run the builtin and
+            # custom parser on the same unnormalized input).
+ # Requiring no stems ensures we'll be able to parse the unstemmed
+ # input
+ CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+ }
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "unknown_words_replacement_string":
+ self.unknown_words_replacement_string,
+ "window_size": self.window_size,
+ "filter_stop_words": self.filter_stop_words,
+ "keep_order": self.keep_order
+ }
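
Usage sketch for the file above — a dict round-trip, assuming FromDict.from_dict ignores keys such as "unit_name" that the constructor does not accept, which is how persisted configs are reloaded.

    from snips_inference_agl.pipeline.configs import (
        IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig)

    # The setters above accept either config instances or plain dicts.
    config = LogRegIntentClassifierConfig(
        data_augmentation_config=IntentClassifierDataAugmentationConfig(
            min_utterances=50),
        noise_reweight_factor=2.0)
    restored = LogRegIntentClassifierConfig.from_dict(config.to_dict())
    assert restored.data_augmentation_config.min_utterances == 50
    assert restored.noise_reweight_factor == 2.0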
diff --git a/snips_inference_agl/pipeline/configs/intent_parser.py b/snips_inference_agl/pipeline/configs/intent_parser.py
new file mode 100644
index 0000000..f017472
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_parser.py
@@ -0,0 +1,127 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS
+from snips_inference_agl.entity_parser import CustomEntityParserUsage
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class ProbabilisticIntentParserConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.ProbabilisticIntentParser` object
+
+ Args:
+ intent_classifier_config (:class:`.ProcessingUnitConfig`): The
+ configuration of the underlying intent classifier, by default
+ it uses a :class:`.LogRegIntentClassifierConfig`
+ slot_filler_config (:class:`.ProcessingUnitConfig`): The configuration
+ that will be used for the underlying slot fillers, by default it
+ uses a :class:`.CRFSlotFillerConfig`
+ """
+
+ def __init__(self, intent_classifier_config=None, slot_filler_config=None):
+ from snips_inference_agl.intent_classifier import IntentClassifier
+ from snips_inference_agl.slot_filler import SlotFiller
+
+ if intent_classifier_config is None:
+ from snips_inference_agl.pipeline.configs import LogRegIntentClassifierConfig
+ intent_classifier_config = LogRegIntentClassifierConfig()
+ if slot_filler_config is None:
+ from snips_inference_agl.pipeline.configs import CRFSlotFillerConfig
+ slot_filler_config = CRFSlotFillerConfig()
+ self.intent_classifier_config = IntentClassifier.get_config(
+ intent_classifier_config)
+ self.slot_filler_config = SlotFiller.get_config(slot_filler_config)
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_parser import ProbabilisticIntentParser
+ return ProbabilisticIntentParser.unit_name
+
+ def get_required_resources(self):
+ resources = self.intent_classifier_config.get_required_resources()
+ resources = merge_required_resources(
+ resources, self.slot_filler_config.get_required_resources())
+ return resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "slot_filler_config": self.slot_filler_config.to_dict(),
+ "intent_classifier_config": self.intent_classifier_config.to_dict()
+ }
+
+
+class DeterministicIntentParserConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.DeterministicIntentParser`
+
+ Args:
+        max_queries (int, optional): Maximum number of regex patterns per
+            intent. 100 by default.
+        max_pattern_length (int, optional): Maximum length of regex patterns
+            (default=1000).
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building patterns.
+
+    These thresholds allow deactivating regular expressions when they grow
+    too big, which avoids explosions in time and memory.
+
+ Note:
+        In the future, an FST will be used instead of regexps, removing the
+        need for all this
+ """
+
+ def __init__(self, max_queries=100, max_pattern_length=1000,
+ ignore_stop_words=False):
+ self.max_queries = max_queries
+ self.max_pattern_length = max_pattern_length
+ self.ignore_stop_words = ignore_stop_words
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_parser import DeterministicIntentParser
+ return DeterministicIntentParser.unit_name
+
+ def get_required_resources(self):
+ return {
+ CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+ STOP_WORDS: self.ignore_stop_words
+ }
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "max_queries": self.max_queries,
+ "max_pattern_length": self.max_pattern_length,
+ "ignore_stop_words": self.ignore_stop_words
+ }
+
+
+class LookupIntentParserConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.LookupIntentParser`
+
+ Args:
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building the lookup table.
+ """
+
+ def __init__(self, ignore_stop_words=False):
+ self.ignore_stop_words = ignore_stop_words
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.intent_parser.lookup_intent_parser import \
+ LookupIntentParser
+ return LookupIntentParser.unit_name
+
+ def get_required_resources(self):
+ return {
+ CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+ STOP_WORDS: self.ignore_stop_words
+ }
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "ignore_stop_words": self.ignore_stop_words
+ }
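
Usage sketch for the file above — combining the two parser configs, assuming the package is importable; the parameter values are illustrative only.

    from snips_inference_agl.pipeline.configs import (
        DeterministicIntentParserConfig, ProbabilisticIntentParserConfig)

    # A stricter deterministic parser next to a default probabilistic one.
    det_config = DeterministicIntentParserConfig(
        max_queries=50, ignore_stop_words=True)
    prob_config = ProbabilisticIntentParserConfig()
    print(det_config.to_dict())
    print(prob_config.get_required_resources())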
diff --git a/snips_inference_agl/pipeline/configs/nlu_engine.py b/snips_inference_agl/pipeline/configs/nlu_engine.py
new file mode 100644
index 0000000..3826702
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/nlu_engine.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE
+from snips_inference_agl.entity_parser import CustomEntityParserUsage
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class NLUEngineConfig(FromDict, ProcessingUnitConfig):
+ """Configuration of a :class:`.SnipsNLUEngine` object
+
+ Args:
+ intent_parsers_configs (list): List of intent parser configs
+ (:class:`.ProcessingUnitConfig`). The order in the list determines
+            the order in which each parser will be called by the nlu engine.
+        random_seed (int, optional): Seed used to make the pipeline
+            deterministic and reproducible (default=None)
+    """
+
+ def __init__(self, intent_parsers_configs=None, random_seed=None):
+ from snips_inference_agl.intent_parser import IntentParser
+
+ if intent_parsers_configs is None:
+ from snips_inference_agl.pipeline.configs import (
+ ProbabilisticIntentParserConfig,
+ DeterministicIntentParserConfig)
+ intent_parsers_configs = [
+ DeterministicIntentParserConfig(),
+ ProbabilisticIntentParserConfig()
+ ]
+ self.intent_parsers_configs = [
+ IntentParser.get_config(conf) for conf in intent_parsers_configs]
+ self.random_seed = random_seed
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.nlu_engine.nlu_engine import SnipsNLUEngine
+ return SnipsNLUEngine.unit_name
+
+ def get_required_resources(self):
+ # Resolving custom slot values must be done without stemming
+ resources = {
+ CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+ }
+ for config in self.intent_parsers_configs:
+ resources = merge_required_resources(
+ resources, config.get_required_resources())
+ return resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "intent_parsers_configs": [
+ config.to_dict() for config in self.intent_parsers_configs
+ ]
+ }
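
Usage sketch for the file above — mirroring the default parser ordering, where the deterministic parser is tried first and the probabilistic one acts as fallback; assumes the package is importable.

    from snips_inference_agl.pipeline.configs import (
        DeterministicIntentParserConfig, NLUEngineConfig,
        ProbabilisticIntentParserConfig)

    engine_config = NLUEngineConfig([
        DeterministicIntentParserConfig(ignore_stop_words=True),
        ProbabilisticIntentParserConfig(),
    ])
    # The list order is preserved in intent_parsers_configs.
    print([c.unit_name for c in engine_config.intent_parsers_configs])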
diff --git a/snips_inference_agl/pipeline/configs/slot_filler.py b/snips_inference_agl/pipeline/configs/slot_filler.py
new file mode 100644
index 0000000..be36e9c
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/slot_filler.py
@@ -0,0 +1,143 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import STOP_WORDS
+from snips_inference_agl.pipeline.configs import (
+ Config, ProcessingUnitConfig, default_features_factories)
+from snips_inference_agl.resources import merge_required_resources
+
+
+class CRFSlotFillerConfig(FromDict, ProcessingUnitConfig):
+ # pylint: disable=line-too-long
+ """Configuration of a :class:`.CRFSlotFiller`
+
+ Args:
+ feature_factory_configs (list, optional): List of configurations that
+ specify the list of :class:`.CRFFeatureFactory` to use with the CRF
+ tagging_scheme (:class:`.TaggingScheme`, optional): Tagging scheme to
+ use to enrich CRF labels (default=BIO)
+        crf_args (dict, optional): Allows overriding the parameters of the CRF
+ defined in *sklearn_crfsuite*, see :class:`sklearn_crfsuite.CRF`
+ (default={"c1": .1, "c2": .1, "algorithm": "lbfgs"})
+ data_augmentation_config (dict or :class:`.SlotFillerDataAugmentationConfig`, optional):
+ Specify how to augment data before training the CRF, see the
+ corresponding config object for more details.
+ """
+
+ # pylint: enable=line-too-long
+
+ def __init__(self, feature_factory_configs=None,
+ tagging_scheme=None, crf_args=None,
+ data_augmentation_config=None):
+ if tagging_scheme is None:
+ from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+ tagging_scheme = TaggingScheme.BIO
+ if feature_factory_configs is None:
+ feature_factory_configs = default_features_factories()
+ if crf_args is None:
+ crf_args = _default_crf_args()
+ if data_augmentation_config is None:
+ data_augmentation_config = SlotFillerDataAugmentationConfig()
+ self.feature_factory_configs = feature_factory_configs
+ self._tagging_scheme = None
+ self.tagging_scheme = tagging_scheme
+ self.crf_args = crf_args
+ self._data_augmentation_config = None
+ self.data_augmentation_config = data_augmentation_config
+
+ @property
+ def tagging_scheme(self):
+ return self._tagging_scheme
+
+ @tagging_scheme.setter
+ def tagging_scheme(self, value):
+ from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+ if isinstance(value, TaggingScheme):
+ self._tagging_scheme = value
+ elif isinstance(value, int):
+ self._tagging_scheme = TaggingScheme(value)
+ else:
+ raise TypeError("Expected instance of TaggingScheme or int but"
+ "received: %s" % type(value))
+
+ @property
+ def data_augmentation_config(self):
+ return self._data_augmentation_config
+
+ @data_augmentation_config.setter
+ def data_augmentation_config(self, value):
+ if isinstance(value, dict):
+ self._data_augmentation_config = \
+ SlotFillerDataAugmentationConfig.from_dict(value)
+ elif isinstance(value, SlotFillerDataAugmentationConfig):
+ self._data_augmentation_config = value
+ else:
+ raise TypeError("Expected instance of "
+ "SlotFillerDataAugmentationConfig or dict but "
+ "received: %s" % type(value))
+
+ @property
+ def unit_name(self):
+ from snips_inference_agl.slot_filler import CRFSlotFiller
+ return CRFSlotFiller.unit_name
+
+ def get_required_resources(self):
+ # Import here to avoid circular imports
+ from snips_inference_agl.slot_filler.feature_factory import CRFFeatureFactory
+
+ resources = self.data_augmentation_config.get_required_resources()
+ for config in self.feature_factory_configs:
+ factory = CRFFeatureFactory.from_config(config)
+ resources = merge_required_resources(
+ resources, factory.get_required_resources())
+ return resources
+
+ def to_dict(self):
+ return {
+ "unit_name": self.unit_name,
+ "feature_factory_configs": self.feature_factory_configs,
+ "crf_args": self.crf_args,
+ "tagging_scheme": self.tagging_scheme.value,
+ "data_augmentation_config":
+ self.data_augmentation_config.to_dict()
+ }
+
+
+class SlotFillerDataAugmentationConfig(FromDict, Config):
+ """Specify how to augment data before training the CRF
+
+ Data augmentation essentially consists in creating additional utterances
+ by combining utterance patterns and slot values
+
+ Args:
+ min_utterances (int, optional): Specify the minimum amount of
+ utterances to generate per intent (default=200)
+ capitalization_ratio (float, optional): If an entity has one or more
+ capitalized values, the data augmentation will randomly capitalize
+ its values with a ratio of *capitalization_ratio* (default=.2)
+ add_builtin_entities_examples (bool, optional): If True, some builtin
+ entity examples will be automatically added to the training data.
+ Default is True.
+ """
+
+ def __init__(self, min_utterances=200, capitalization_ratio=.2,
+ add_builtin_entities_examples=True):
+ self.min_utterances = min_utterances
+ self.capitalization_ratio = capitalization_ratio
+ self.add_builtin_entities_examples = add_builtin_entities_examples
+
+ def get_required_resources(self):
+ return {
+ STOP_WORDS: True
+ }
+
+ def to_dict(self):
+ return {
+ "min_utterances": self.min_utterances,
+ "capitalization_ratio": self.capitalization_ratio,
+ "add_builtin_entities_examples": self.add_builtin_entities_examples
+ }
+
+
+def _default_crf_args():
+ return {"c1": .1, "c2": .1, "algorithm": "lbfgs"}
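
Usage sketch for the file above — overriding the CRF hyperparameters while keeping the default feature factories and BIO tagging scheme; the c1/c2 values are illustrative only.

    from snips_inference_agl.pipeline.configs import CRFSlotFillerConfig

    config = CRFSlotFillerConfig(
        crf_args={"c1": 0.2, "c2": 0.05, "algorithm": "lbfgs"})
    # tagging_scheme is serialized by its integer enum value in to_dict().
    print(config.to_dict()["crf_args"])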
diff --git a/snips_inference_agl/pipeline/processing_unit.py b/snips_inference_agl/pipeline/processing_unit.py
new file mode 100644
index 0000000..1928470
--- /dev/null
+++ b/snips_inference_agl/pipeline/processing_unit.py
@@ -0,0 +1,177 @@
+from __future__ import unicode_literals
+
+import io
+import json
+import shutil
+from abc import ABCMeta, abstractmethod, abstractproperty
+from builtins import str, bytes
+from pathlib import Path
+
+from future.utils import with_metaclass
+
+from snips_inference_agl.common.abc_utils import abstractclassmethod, classproperty
+from snips_inference_agl.common.io_utils import temp_dir, unzip_archive
+from snips_inference_agl.common.registrable import Registrable
+from snips_inference_agl.common.utils import (
+ json_string, check_random_state)
+from snips_inference_agl.constants import (
+ BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, CUSTOM_ENTITY_PARSER_USAGE,
+ RESOURCES, LANGUAGE, RANDOM_STATE)
+from snips_inference_agl.entity_parser import (
+ BuiltinEntityParser, CustomEntityParser, CustomEntityParserUsage)
+from snips_inference_agl.exceptions import LoadingError
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.pipeline.configs.config import DefaultProcessingUnitConfig
+from snips_inference_agl.resources import load_resources
+
+
+class ProcessingUnit(with_metaclass(ABCMeta, Registrable)):
+ """Abstraction of a NLU pipeline unit
+
+ Pipeline processing units such as intent parsers, intent classifiers and
+ slot fillers must implement this class.
+
+ A :class:`ProcessingUnit` is associated with a *config_type*, which
+ represents the :class:`.ProcessingUnitConfig` used to initialize it.
+ """
+
+ def __init__(self, config, **shared):
+ if config is None:
+ self.config = self.default_config()
+ elif isinstance(config, ProcessingUnitConfig):
+ self.config = config
+ elif isinstance(config, dict):
+ self.config = self.config_type.from_dict(config)
+ else:
+ raise ValueError("Unexpected config type: %s" % type(config))
+ if self.config is not None:
+ self.config.set_unit_name(self.unit_name)
+ self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER)
+ self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER)
+ self.resources = shared.get(RESOURCES)
+ self.random_state = check_random_state(shared.get(RANDOM_STATE))
+
+ @classproperty
+ def config_type(cls): # pylint:disable=no-self-argument
+ return DefaultProcessingUnitConfig
+
+ @classmethod
+ def default_config(cls):
+ config = cls.config_type() # pylint:disable=no-value-for-parameter
+ config.set_unit_name(cls.unit_name)
+ return config
+
+ @classproperty
+ def unit_name(cls): # pylint:disable=no-self-argument
+ return ProcessingUnit.registered_name(cls)
+
+ @classmethod
+ def from_config(cls, unit_config, **shared):
+ """Build a :class:`ProcessingUnit` from the provided config"""
+ unit = cls.by_name(unit_config.unit_name)
+ return unit(unit_config, **shared)
+
+ @classmethod
+ def load_from_path(cls, unit_path, unit_name=None, **shared):
+ """Load a :class:`ProcessingUnit` from a persisted processing unit
+ directory
+
+ Args:
+ unit_path (str or :class:`pathlib.Path`): path to the persisted
+ processing unit
+ unit_name (str, optional): Name of the processing unit to load.
+ By default, the unit name is assumed to be stored in a
+ "metadata.json" file located in the directory at unit_path.
+
+ Raises:
+ LoadingError: when unit_name is None and no metadata file is found
+ in the processing unit directory
+ """
+ unit_path = Path(unit_path)
+ if unit_name is None:
+ metadata_path = unit_path / "metadata.json"
+ if not metadata_path.exists():
+ raise LoadingError(
+ "Missing metadata for processing unit at path %s"
+ % str(unit_path))
+ with metadata_path.open(encoding="utf8") as f:
+ metadata = json.load(f)
+ unit_name = metadata["unit_name"]
+ unit = cls.by_name(unit_name)
+ return unit.from_path(unit_path, **shared)
+
+ @classmethod
+ def get_config(cls, unit_config):
+ """Returns the :class:`.ProcessingUnitConfig` corresponding to
+ *unit_config*"""
+ if isinstance(unit_config, ProcessingUnitConfig):
+ return unit_config
+ elif isinstance(unit_config, dict):
+ unit_name = unit_config["unit_name"]
+ processing_unit_type = cls.by_name(unit_name)
+ return processing_unit_type.config_type.from_dict(unit_config)
+ elif isinstance(unit_config, (str, bytes)):
+ unit_name = unit_config
+ unit_config = {"unit_name": unit_name}
+ processing_unit_type = cls.by_name(unit_name)
+ return processing_unit_type.config_type.from_dict(unit_config)
+ else:
+ raise ValueError(
+ "Expected `unit_config` to be an instance of "
+ "ProcessingUnitConfig or dict or str but found: %s"
+ % type(unit_config))
+
+ @abstractproperty
+ def fitted(self):
+ """Whether or not the processing unit has already been trained"""
+ pass
+
+ def load_resources_if_needed(self, language):
+ if self.resources is None or self.fitted:
+ required_resources = None
+ if self.config is not None:
+ required_resources = self.config.get_required_resources()
+ self.resources = load_resources(language, required_resources)
+
+ def fit_builtin_entity_parser_if_needed(self, dataset):
+        # We only build a builtin entity parser when the unit has already
+        # been fitted or when no parser was provided.
+        # Otherwise the parser is provided, already fitted, by another unit.
+ if self.builtin_entity_parser is None or self.fitted:
+ self.builtin_entity_parser = BuiltinEntityParser.build(
+ dataset=dataset)
+ return self
+
+ def fit_custom_entity_parser_if_needed(self, dataset):
+        # We only build a custom entity parser when the unit has already
+        # been fitted or when no parser was provided.
+        # Otherwise the parser is provided, already fitted, by another unit.
+ required_resources = self.config.get_required_resources()
+ if not required_resources or not required_resources.get(
+ CUSTOM_ENTITY_PARSER_USAGE):
+ # In these cases we need a custom entity parser only to do the
+ # final slot resolution step, which must be done without stemming.
+ parser_usage = CustomEntityParserUsage.WITHOUT_STEMS
+ else:
+ parser_usage = required_resources[CUSTOM_ENTITY_PARSER_USAGE]
+
+ if self.custom_entity_parser is None or self.fitted:
+ self.load_resources_if_needed(dataset[LANGUAGE])
+ self.custom_entity_parser = CustomEntityParser.build(
+ dataset, parser_usage, self.resources)
+ return self
+
+ def persist_metadata(self, path, **kwargs):
+ metadata = {"unit_name": self.unit_name}
+ metadata.update(kwargs)
+ metadata_json = json_string(metadata)
+ with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
+ f.write(metadata_json)
+
+ # @abstractmethod
+ def persist(self, path):
+ pass
+
+ @abstractclassmethod
+ def from_path(cls, path, **shared):
+ pass
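
Usage sketch for the file above — loading a persisted unit, assuming a directory previously written via persist()/persist_metadata() and a unit class registered under the name stored in its metadata; the path is purely hypothetical.

    from snips_inference_agl.pipeline.processing_unit import ProcessingUnit

    # load_from_path reads "metadata.json" to discover the unit name, then
    # dispatches to the registered class's from_path().
    unit = ProcessingUnit.load_from_path("/tmp/my_persisted_unit")
    print(unit.unit_name)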