Diffstat (limited to 'snips_inference_agl/pipeline')
-rw-r--r--  snips_inference_agl/pipeline/__init__.py                   |    0
-rw-r--r--  snips_inference_agl/pipeline/configs/__init__.py           |   10
-rw-r--r--  snips_inference_agl/pipeline/configs/config.py             |   49
-rw-r--r--  snips_inference_agl/pipeline/configs/features.py           |   81
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_classifier.py  |  307
-rw-r--r--  snips_inference_agl/pipeline/configs/intent_parser.py      |  127
-rw-r--r--  snips_inference_agl/pipeline/configs/nlu_engine.py         |   55
-rw-r--r--  snips_inference_agl/pipeline/configs/slot_filler.py        |  145
-rw-r--r--  snips_inference_agl/pipeline/processing_unit.py            |  177
9 files changed, 951 insertions(+), 0 deletions(-)
diff --git a/snips_inference_agl/pipeline/__init__.py b/snips_inference_agl/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/snips_inference_agl/pipeline/__init__.py
diff --git a/snips_inference_agl/pipeline/configs/__init__.py b/snips_inference_agl/pipeline/configs/__init__.py
new file mode 100644
index 0000000..027f286
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/__init__.py
@@ -0,0 +1,10 @@
+from .config import Config, ProcessingUnitConfig
+from .features import default_features_factories
+from .intent_classifier import (CooccurrenceVectorizerConfig, FeaturizerConfig,
+                                IntentClassifierDataAugmentationConfig,
+                                LogRegIntentClassifierConfig)
+from .intent_parser import (DeterministicIntentParserConfig,
+                            LookupIntentParserConfig,
+                            ProbabilisticIntentParserConfig)
+from .nlu_engine import NLUEngineConfig
+from .slot_filler import CRFSlotFillerConfig, SlotFillerDataAugmentationConfig
diff --git a/snips_inference_agl/pipeline/configs/config.py b/snips_inference_agl/pipeline/configs/config.py
new file mode 100644
index 0000000..4267fa2
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/config.py
@@ -0,0 +1,49 @@
+from __future__ import unicode_literals
+
+from abc import ABCMeta, abstractmethod, abstractproperty
+from builtins import object
+
+from future.utils import with_metaclass
+
+
+class Config(with_metaclass(ABCMeta, object)):
+    @abstractmethod
+    def to_dict(self):
+        pass
+
+    @classmethod
+    def from_dict(cls, obj_dict):
+        raise NotImplementedError
+
+
+class ProcessingUnitConfig(with_metaclass(ABCMeta, Config)):
+    """Represents the configuration object needed to initialize a
+    :class:`.ProcessingUnit`"""
+
+    @abstractproperty
+    def unit_name(self):
+        raise NotImplementedError
+
+    def set_unit_name(self, value):
+        pass
+
+    def get_required_resources(self):
+        return None
+
+
+class DefaultProcessingUnitConfig(dict, ProcessingUnitConfig):
+    """Default config implemented as a simple dict"""
+
+    @property
+    def unit_name(self):
+        return self["unit_name"]
+
+    def set_unit_name(self, value):
+        self["unit_name"] = value
+
+    def to_dict(self):
+        return self
+
+    @classmethod
+    def from_dict(cls, obj_dict):
+        return cls(obj_dict)
diff --git a/snips_inference_agl/pipeline/configs/features.py b/snips_inference_agl/pipeline/configs/features.py
new file mode 100644
index 0000000..fa12e1a
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/features.py
@@ -0,0 +1,81 @@
+def default_features_factories():
+    """These are the default features used by the :class:`.CRFSlotFiller`
+    objects"""
+
+    from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+    from snips_inference_agl.slot_filler.feature_factory import (
+        NgramFactory, IsDigitFactory, IsFirstFactory, IsLastFactory,
+        ShapeNgramFactory, CustomEntityMatchFactory, BuiltinEntityMatchFactory)
+
+    return [
+        {
+            "args": {
+                "common_words_gazetteer_name": None,
+                "use_stemming": False,
+                "n": 1
+            },
+            "factory_name": NgramFactory.name,
+            "offsets": [-2, -1, 0, 1, 2]
+        },
+        {
+            "args": {
+                "common_words_gazetteer_name": None,
+                "use_stemming": False,
+                "n": 2
+            },
+            "factory_name": NgramFactory.name,
+            "offsets": [-2, 1]
+        },
+        {
+            "args": {},
+            "factory_name": IsDigitFactory.name,
+            "offsets": [-1, 0, 1]
+        },
+        {
+            "args": {},
+            "factory_name": IsFirstFactory.name,
+            "offsets": [-2, -1, 0]
+        },
+        {
+            "args": {},
+            "factory_name": IsLastFactory.name,
+            "offsets": [0, 1, 2]
+        },
+        {
+            "args": {
+                "n": 1
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [0]
+        },
+        {
+            "args": {
+                "n": 2
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [-1, 0]
+        },
+        {
+            "args": {
+                "n": 3
+            },
+            "factory_name": ShapeNgramFactory.name,
+            "offsets": [-1]
+        },
+        {
+            "args": {
+                "use_stemming": False,
+                "tagging_scheme_code": TaggingScheme.BILOU.value,
+            },
+            "factory_name": CustomEntityMatchFactory.name,
+            "offsets": [-2, -1, 0],
+            "drop_out": 0.5
+        },
+        {
+            "args": {
+                "tagging_scheme_code": TaggingScheme.BIO.value,
+            },
+            "factory_name": BuiltinEntityMatchFactory.name,
+            "offsets": [-2, -1, 0]
+        },
+    ]
diff --git a/snips_inference_agl/pipeline/configs/intent_classifier.py b/snips_inference_agl/pipeline/configs/intent_classifier.py
new file mode 100644
index 0000000..fc22c87
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_classifier.py
@@ -0,0 +1,307 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import (
+    CUSTOM_ENTITY_PARSER_USAGE, NOISE, STEMS, STOP_WORDS, WORD_CLUSTERS)
+from snips_inference_agl.entity_parser.custom_entity_parser import (
+    CustomEntityParserUsage)
+from snips_inference_agl.pipeline.configs import Config, ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.LogRegIntentClassifier`"""
+
+    # pylint: disable=line-too-long
+    def __init__(self, data_augmentation_config=None, featurizer_config=None,
+                 noise_reweight_factor=1.0):
+        """
+        Args:
+            data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
+                Defines the strategy of the underlying data augmentation
+            featurizer_config (:class:`FeaturizerConfig`): Configuration of the
+                :class:`.Featurizer` used underneath
+            noise_reweight_factor (float, optional): this parameter allows
+                changing the weight of the None class. By default, the class
+                weights are computed using a "balanced" strategy. The
+                noise_reweight_factor allows deviating from this strategy.
+ """ + if data_augmentation_config is None: + data_augmentation_config = IntentClassifierDataAugmentationConfig() + if featurizer_config is None: + featurizer_config = FeaturizerConfig() + self._data_augmentation_config = None + self.data_augmentation_config = data_augmentation_config + self._featurizer_config = None + self.featurizer_config = featurizer_config + self.noise_reweight_factor = noise_reweight_factor + + # pylint: enable=line-too-long + + @property + def data_augmentation_config(self): + return self._data_augmentation_config + + @data_augmentation_config.setter + def data_augmentation_config(self, value): + if isinstance(value, dict): + self._data_augmentation_config = \ + IntentClassifierDataAugmentationConfig.from_dict(value) + elif isinstance(value, IntentClassifierDataAugmentationConfig): + self._data_augmentation_config = value + else: + raise TypeError("Expected instance of " + "IntentClassifierDataAugmentationConfig or dict" + "but received: %s" % type(value)) + + @property + def featurizer_config(self): + return self._featurizer_config + + @featurizer_config.setter + def featurizer_config(self, value): + if isinstance(value, dict): + self._featurizer_config = \ + FeaturizerConfig.from_dict(value) + elif isinstance(value, FeaturizerConfig): + self._featurizer_config = value + else: + raise TypeError("Expected instance of FeaturizerConfig or dict" + "but received: %s" % type(value)) + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import LogRegIntentClassifier + return LogRegIntentClassifier.unit_name + + def get_required_resources(self): + resources = self.data_augmentation_config.get_required_resources() + resources = merge_required_resources( + resources, self.featurizer_config.get_required_resources()) + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "data_augmentation_config": + self.data_augmentation_config.to_dict(), + "featurizer_config": self.featurizer_config.to_dict(), + "noise_reweight_factor": self.noise_reweight_factor, + } + + +class IntentClassifierDataAugmentationConfig(FromDict, Config): + """Configuration used by a :class:`.LogRegIntentClassifier` which defines + how to augment data to improve the training of the classifier""" + + def __init__(self, min_utterances=20, noise_factor=5, + add_builtin_entities_examples=True, unknown_word_prob=0, + unknown_words_replacement_string=None, + max_unknown_words=None): + """ + Args: + min_utterances (int, optional): The minimum number of utterances to + automatically generate for each intent, based on the existing + utterances. Default is 20. + noise_factor (int, optional): Defines the size of the noise to + generate to train the implicit *None* intent, as a multiplier + of the average size of the other intents. Default is 5. + add_builtin_entities_examples (bool, optional): If True, some + builtin entity examples will be automatically added to the + training data. Default is True. 
+ """ + self.min_utterances = min_utterances + self.noise_factor = noise_factor + self.add_builtin_entities_examples = add_builtin_entities_examples + self.unknown_word_prob = unknown_word_prob + self.unknown_words_replacement_string = \ + unknown_words_replacement_string + if max_unknown_words is not None and max_unknown_words < 0: + raise ValueError("max_unknown_words must be None or >= 0") + self.max_unknown_words = max_unknown_words + if unknown_word_prob > 0 and unknown_words_replacement_string is None: + raise ValueError("unknown_word_prob is positive (%s) but the " + "replacement string is None" % unknown_word_prob) + + @staticmethod + def get_required_resources(): + return { + NOISE: True, + STOP_WORDS: True + } + + def to_dict(self): + return { + "min_utterances": self.min_utterances, + "noise_factor": self.noise_factor, + "add_builtin_entities_examples": + self.add_builtin_entities_examples, + "unknown_word_prob": self.unknown_word_prob, + "unknown_words_replacement_string": + self.unknown_words_replacement_string, + "max_unknown_words": self.max_unknown_words + } + + +class FeaturizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.Featurizer` object""" + + # pylint: disable=line-too-long + def __init__(self, tfidf_vectorizer_config=None, + cooccurrence_vectorizer_config=None, + pvalue_threshold=0.4, + added_cooccurrence_feature_ratio=0): + """ + Args: + tfidf_vectorizer_config (:class:`.TfidfVectorizerConfig`, optional): + empty configuration of the featurizer's + :attr:`tfidf_vectorizer` + cooccurrence_vectorizer_config: (:class:`.CooccurrenceVectorizerConfig`, optional): + configuration of the featurizer's + :attr:`cooccurrence_vectorizer` + pvalue_threshold (float): after fitting the training set to + extract tfidf features, a univariate feature selection is + applied. Features are tested for independence using a Chi-2 + test, under the null hypothesis that each feature should be + equally present in each class. Only features having a p-value + lower than the threshold are kept + added_cooccurrence_feature_ratio (float, optional): proportion of + cooccurrence features to add with respect to the number of + tfidf features. 
For instance with a ratio of 0.5, if 100 tfidf + features are remaining after feature selection, a maximum of 50 + cooccurrence features will be added + """ + self.pvalue_threshold = pvalue_threshold + self.added_cooccurrence_feature_ratio = \ + added_cooccurrence_feature_ratio + + if tfidf_vectorizer_config is None: + tfidf_vectorizer_config = TfidfVectorizerConfig() + elif isinstance(tfidf_vectorizer_config, dict): + tfidf_vectorizer_config = TfidfVectorizerConfig.from_dict( + tfidf_vectorizer_config) + self.tfidf_vectorizer_config = tfidf_vectorizer_config + + if cooccurrence_vectorizer_config is None: + cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig() + elif isinstance(cooccurrence_vectorizer_config, dict): + cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig \ + .from_dict(cooccurrence_vectorizer_config) + self.cooccurrence_vectorizer_config = cooccurrence_vectorizer_config + + # pylint: enable=line-too-long + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import Featurizer + return Featurizer.unit_name + + def get_required_resources(self): + required_resources = self.tfidf_vectorizer_config \ + .get_required_resources() + if self.cooccurrence_vectorizer_config: + required_resources = merge_required_resources( + required_resources, + self.cooccurrence_vectorizer_config.get_required_resources()) + return required_resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "pvalue_threshold": self.pvalue_threshold, + "added_cooccurrence_feature_ratio": + self.added_cooccurrence_feature_ratio, + "tfidf_vectorizer_config": self.tfidf_vectorizer_config.to_dict(), + "cooccurrence_vectorizer_config": + self.cooccurrence_vectorizer_config.to_dict(), + } + + +class TfidfVectorizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.TfidfVectorizerConfig` object""" + + def __init__(self, word_clusters_name=None, use_stemming=False): + """ + Args: + word_clusters_name (str, optional): if a word cluster name is + provided then the featurizer will use the word clusters IDs + detected in the utterances and add them to the utterance text + before computing the tfidf. Default to None + use_stemming (bool, optional): use stemming before computing the + tfdif. Defaults to False (no stemming used) + """ + self.word_clusters_name = word_clusters_name + self.use_stemming = use_stemming + + @property + def unit_name(self): + from snips_inference_agl.intent_classifier import TfidfVectorizer + return TfidfVectorizer.unit_name + + def get_required_resources(self): + resources = {STEMS: True if self.use_stemming else False} + if self.word_clusters_name: + resources[WORD_CLUSTERS] = {self.word_clusters_name} + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "word_clusters_name": self.word_clusters_name, + "use_stemming": self.use_stemming + } + + +class CooccurrenceVectorizerConfig(FromDict, ProcessingUnitConfig): + """Configuration of a :class:`.CooccurrenceVectorizer` object""" + + def __init__(self, window_size=None, unknown_words_replacement_string=None, + filter_stop_words=True, keep_order=True): + """ + Args: + window_size (int, optional): if provided, word cooccurrences will + be taken into account only in a context window of size + :attr:`window_size`. If the window size is 3 then given a word + w[i], the vectorizer will only extract the following pairs: + (w[i], w[i + 1]), (w[i], w[i + 2]) and (w[i], w[i + 3]). 
+                Defaults to None, which means that we consider all words
+            unknown_words_replacement_string (str, optional)
+            filter_stop_words (bool, optional): if True, stop words are
+                ignored when computing cooccurrences
+            keep_order (bool, optional): if True then cooccurrences are
+                computed taking the word order into account, which means the
+                pairs (w1, w2) and (w2, w1) will count as two separate
+                features. Defaults to `True`.
+        """
+        self.window_size = window_size
+        self.unknown_words_replacement_string = \
+            unknown_words_replacement_string
+        self.filter_stop_words = filter_stop_words
+        self.keep_order = keep_order
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_classifier import CooccurrenceVectorizer
+        return CooccurrenceVectorizer.unit_name
+
+    def get_required_resources(self):
+        return {
+            STOP_WORDS: self.filter_stop_words,
+            # We require the parser to be trained without stems because we
+            # don't normalize and stem when processing in the
+            # CooccurrenceVectorizer (in order to run the builtin and
+            # custom parser on the same unnormalized input).
+            # Requiring no stems ensures we'll be able to parse the unstemmed
+            # input
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+        }
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "unknown_words_replacement_string":
+                self.unknown_words_replacement_string,
+            "window_size": self.window_size,
+            "filter_stop_words": self.filter_stop_words,
+            "keep_order": self.keep_order
+        }
diff --git a/snips_inference_agl/pipeline/configs/intent_parser.py b/snips_inference_agl/pipeline/configs/intent_parser.py
new file mode 100644
index 0000000..f017472
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/intent_parser.py
@@ -0,0 +1,127 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS
+from snips_inference_agl.entity_parser import CustomEntityParserUsage
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class ProbabilisticIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.ProbabilisticIntentParser` object
+
+    Args:
+        intent_classifier_config (:class:`.ProcessingUnitConfig`): The
+            configuration of the underlying intent classifier, by default
+            it uses a :class:`.LogRegIntentClassifierConfig`
+        slot_filler_config (:class:`.ProcessingUnitConfig`): The configuration
+            that will be used for the underlying slot fillers, by default it
+            uses a :class:`.CRFSlotFillerConfig`
+    """
+
+    def __init__(self, intent_classifier_config=None, slot_filler_config=None):
+        from snips_inference_agl.intent_classifier import IntentClassifier
+        from snips_inference_agl.slot_filler import SlotFiller
+
+        if intent_classifier_config is None:
+            from snips_inference_agl.pipeline.configs import LogRegIntentClassifierConfig
+            intent_classifier_config = LogRegIntentClassifierConfig()
+        if slot_filler_config is None:
+            from snips_inference_agl.pipeline.configs import CRFSlotFillerConfig
+            slot_filler_config = CRFSlotFillerConfig()
+        self.intent_classifier_config = IntentClassifier.get_config(
+            intent_classifier_config)
+        self.slot_filler_config = SlotFiller.get_config(slot_filler_config)
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser import ProbabilisticIntentParser
+        return ProbabilisticIntentParser.unit_name
+
+    def get_required_resources(self):
+        resources = self.intent_classifier_config.get_required_resources()
+        resources = merge_required_resources(
+            resources, self.slot_filler_config.get_required_resources())
+        return resources
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "slot_filler_config": self.slot_filler_config.to_dict(),
+            "intent_classifier_config": self.intent_classifier_config.to_dict()
+        }
+
+
+class DeterministicIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.DeterministicIntentParser`
+
+    Args:
+        max_queries (int, optional): Maximum number of regex patterns per
+            intent. 100 by default.
+        max_pattern_length (int, optional): Maximum length of regex patterns.
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building patterns.
+
+    These limits make it possible to deactivate regular expressions when they
+    grow too large, avoiding an explosion in time and memory
+
+    Note:
+        In the future, an FST will be used instead of regexps, removing the
+        need for all this
+    """
+
+    def __init__(self, max_queries=100, max_pattern_length=1000,
+                 ignore_stop_words=False):
+        self.max_queries = max_queries
+        self.max_pattern_length = max_pattern_length
+        self.ignore_stop_words = ignore_stop_words
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser import DeterministicIntentParser
+        return DeterministicIntentParser.unit_name
+
+    def get_required_resources(self):
+        return {
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
+        }
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "max_queries": self.max_queries,
+            "max_pattern_length": self.max_pattern_length,
+            "ignore_stop_words": self.ignore_stop_words
+        }
+
+
+class LookupIntentParserConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.LookupIntentParser`
+
+    Args:
+        ignore_stop_words (bool, optional): If True, stop words will be
+            removed before building patterns.
+    """
+
+    def __init__(self, ignore_stop_words=False):
+        self.ignore_stop_words = ignore_stop_words
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.intent_parser.lookup_intent_parser import \
+            LookupIntentParser
+        return LookupIntentParser.unit_name
+
+    def get_required_resources(self):
+        return {
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
+        }
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "ignore_stop_words": self.ignore_stop_words
+        }
diff --git a/snips_inference_agl/pipeline/configs/nlu_engine.py b/snips_inference_agl/pipeline/configs/nlu_engine.py
new file mode 100644
index 0000000..3826702
--- /dev/null
+++ b/snips_inference_agl/pipeline/configs/nlu_engine.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.common.from_dict import FromDict
+from snips_inference_agl.constants import CUSTOM_ENTITY_PARSER_USAGE
+from snips_inference_agl.entity_parser import CustomEntityParserUsage
+from snips_inference_agl.pipeline.configs import ProcessingUnitConfig
+from snips_inference_agl.resources import merge_required_resources
+
+
+class NLUEngineConfig(FromDict, ProcessingUnitConfig):
+    """Configuration of a :class:`.SnipsNLUEngine` object
+
+    Args:
+        intent_parsers_configs (list): List of intent parser configs
+            (:class:`.ProcessingUnitConfig`). The order in the list determines
+            the order in which each parser will be called by the NLU engine.
+ """ + + def __init__(self, intent_parsers_configs=None, random_seed=None): + from snips_inference_agl.intent_parser import IntentParser + + if intent_parsers_configs is None: + from snips_inference_agl.pipeline.configs import ( + ProbabilisticIntentParserConfig, + DeterministicIntentParserConfig) + intent_parsers_configs = [ + DeterministicIntentParserConfig(), + ProbabilisticIntentParserConfig() + ] + self.intent_parsers_configs = [ + IntentParser.get_config(conf) for conf in intent_parsers_configs] + self.random_seed = random_seed + + @property + def unit_name(self): + from snips_inference_agl.nlu_engine.nlu_engine import SnipsNLUEngine + return SnipsNLUEngine.unit_name + + def get_required_resources(self): + # Resolving custom slot values must be done without stemming + resources = { + CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS + } + for config in self.intent_parsers_configs: + resources = merge_required_resources( + resources, config.get_required_resources()) + return resources + + def to_dict(self): + return { + "unit_name": self.unit_name, + "intent_parsers_configs": [ + config.to_dict() for config in self.intent_parsers_configs + ] + } diff --git a/snips_inference_agl/pipeline/configs/slot_filler.py b/snips_inference_agl/pipeline/configs/slot_filler.py new file mode 100644 index 0000000..be36e9c --- /dev/null +++ b/snips_inference_agl/pipeline/configs/slot_filler.py @@ -0,0 +1,145 @@ +from __future__ import unicode_literals + +from snips_inference_agl.common.from_dict import FromDict +from snips_inference_agl.constants import STOP_WORDS +from snips_inference_agl.pipeline.configs import ( + Config, ProcessingUnitConfig, default_features_factories) +from snips_inference_agl.resources import merge_required_resources + + +class CRFSlotFillerConfig(FromDict, ProcessingUnitConfig): + # pylint: disable=line-too-long + """Configuration of a :class:`.CRFSlotFiller` + + Args: + feature_factory_configs (list, optional): List of configurations that + specify the list of :class:`.CRFFeatureFactory` to use with the CRF + tagging_scheme (:class:`.TaggingScheme`, optional): Tagging scheme to + use to enrich CRF labels (default=BIO) + crf_args (dict, optional): Allow to overwrite the parameters of the CRF + defined in *sklearn_crfsuite*, see :class:`sklearn_crfsuite.CRF` + (default={"c1": .1, "c2": .1, "algorithm": "lbfgs"}) + data_augmentation_config (dict or :class:`.SlotFillerDataAugmentationConfig`, optional): + Specify how to augment data before training the CRF, see the + corresponding config object for more details. 
+    """
+
+    # pylint: enable=line-too-long
+
+    def __init__(self, feature_factory_configs=None,
+                 tagging_scheme=None, crf_args=None,
+                 data_augmentation_config=None):
+        if tagging_scheme is None:
+            from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+            tagging_scheme = TaggingScheme.BIO
+        if feature_factory_configs is None:
+            feature_factory_configs = default_features_factories()
+        if crf_args is None:
+            crf_args = _default_crf_args()
+        if data_augmentation_config is None:
+            data_augmentation_config = SlotFillerDataAugmentationConfig()
+        self.feature_factory_configs = feature_factory_configs
+        self._tagging_scheme = None
+        self.tagging_scheme = tagging_scheme
+        self.crf_args = crf_args
+        self._data_augmentation_config = None
+        self.data_augmentation_config = data_augmentation_config
+
+    @property
+    def tagging_scheme(self):
+        return self._tagging_scheme
+
+    @tagging_scheme.setter
+    def tagging_scheme(self, value):
+        from snips_inference_agl.slot_filler.crf_utils import TaggingScheme
+        if isinstance(value, TaggingScheme):
+            self._tagging_scheme = value
+        elif isinstance(value, int):
+            self._tagging_scheme = TaggingScheme(value)
+        else:
+            raise TypeError("Expected instance of TaggingScheme or int but "
+                            "received: %s" % type(value))
+
+    @property
+    def data_augmentation_config(self):
+        return self._data_augmentation_config
+
+    @data_augmentation_config.setter
+    def data_augmentation_config(self, value):
+        if isinstance(value, dict):
+            self._data_augmentation_config = \
+                SlotFillerDataAugmentationConfig.from_dict(value)
+        elif isinstance(value, SlotFillerDataAugmentationConfig):
+            self._data_augmentation_config = value
+        else:
+            raise TypeError("Expected instance of "
+                            "SlotFillerDataAugmentationConfig or dict but "
+                            "received: %s" % type(value))
+
+    @property
+    def unit_name(self):
+        from snips_inference_agl.slot_filler import CRFSlotFiller
+        return CRFSlotFiller.unit_name
+
+    def get_required_resources(self):
+        # Import here to avoid circular imports
+        from snips_inference_agl.slot_filler.feature_factory import CRFFeatureFactory
+
+        resources = self.data_augmentation_config.get_required_resources()
+        for config in self.feature_factory_configs:
+            factory = CRFFeatureFactory.from_config(config)
+            resources = merge_required_resources(
+                resources, factory.get_required_resources())
+        return resources
+
+    def to_dict(self):
+        return {
+            "unit_name": self.unit_name,
+            "feature_factory_configs": self.feature_factory_configs,
+            "crf_args": self.crf_args,
+            "tagging_scheme": self.tagging_scheme.value,
+            "data_augmentation_config":
+                self.data_augmentation_config.to_dict()
+        }
+
+
+class SlotFillerDataAugmentationConfig(FromDict, Config):
+    """Specify how to augment data before training the CRF
+
+    Data augmentation essentially consists in creating additional utterances
+    by combining utterance patterns and slot values
+
+    Args:
+        min_utterances (int, optional): Specify the minimum amount of
+            utterances to generate per intent (default=200)
+        capitalization_ratio (float, optional): If an entity has one or more
+            capitalized values, the data augmentation will randomly capitalize
+            its values with a ratio of *capitalization_ratio* (default=.2)
+        add_builtin_entities_examples (bool, optional): If True, some builtin
+            entity examples will be automatically added to the training data.
+            Default is True.
+ """ + + def __init__(self, min_utterances=200, capitalization_ratio=.2, + add_builtin_entities_examples=True): + self.min_utterances = min_utterances + self.capitalization_ratio = capitalization_ratio + self.add_builtin_entities_examples = add_builtin_entities_examples + + def get_required_resources(self): + return { + STOP_WORDS: True + } + + def to_dict(self): + return { + "min_utterances": self.min_utterances, + "capitalization_ratio": self.capitalization_ratio, + "add_builtin_entities_examples": self.add_builtin_entities_examples + } + + +def _default_crf_args(): + return {"c1": .1, "c2": .1, "algorithm": "lbfgs"} diff --git a/snips_inference_agl/pipeline/processing_unit.py b/snips_inference_agl/pipeline/processing_unit.py new file mode 100644 index 0000000..1928470 --- /dev/null +++ b/snips_inference_agl/pipeline/processing_unit.py @@ -0,0 +1,177 @@ +from __future__ import unicode_literals + +import io +import json +import shutil +from abc import ABCMeta, abstractmethod, abstractproperty +from builtins import str, bytes +from pathlib import Path + +from future.utils import with_metaclass + +from snips_inference_agl.common.abc_utils import abstractclassmethod, classproperty +from snips_inference_agl.common.io_utils import temp_dir, unzip_archive +from snips_inference_agl.common.registrable import Registrable +from snips_inference_agl.common.utils import ( + json_string, check_random_state) +from snips_inference_agl.constants import ( + BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, CUSTOM_ENTITY_PARSER_USAGE, + RESOURCES, LANGUAGE, RANDOM_STATE) +from snips_inference_agl.entity_parser import ( + BuiltinEntityParser, CustomEntityParser, CustomEntityParserUsage) +from snips_inference_agl.exceptions import LoadingError +from snips_inference_agl.pipeline.configs import ProcessingUnitConfig +from snips_inference_agl.pipeline.configs.config import DefaultProcessingUnitConfig +from snips_inference_agl.resources import load_resources + + +class ProcessingUnit(with_metaclass(ABCMeta, Registrable)): + """Abstraction of a NLU pipeline unit + + Pipeline processing units such as intent parsers, intent classifiers and + slot fillers must implement this class. + + A :class:`ProcessingUnit` is associated with a *config_type*, which + represents the :class:`.ProcessingUnitConfig` used to initialize it. 
+ """ + + def __init__(self, config, **shared): + if config is None: + self.config = self.default_config() + elif isinstance(config, ProcessingUnitConfig): + self.config = config + elif isinstance(config, dict): + self.config = self.config_type.from_dict(config) + else: + raise ValueError("Unexpected config type: %s" % type(config)) + if self.config is not None: + self.config.set_unit_name(self.unit_name) + self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER) + self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER) + self.resources = shared.get(RESOURCES) + self.random_state = check_random_state(shared.get(RANDOM_STATE)) + + @classproperty + def config_type(cls): # pylint:disable=no-self-argument + return DefaultProcessingUnitConfig + + @classmethod + def default_config(cls): + config = cls.config_type() # pylint:disable=no-value-for-parameter + config.set_unit_name(cls.unit_name) + return config + + @classproperty + def unit_name(cls): # pylint:disable=no-self-argument + return ProcessingUnit.registered_name(cls) + + @classmethod + def from_config(cls, unit_config, **shared): + """Build a :class:`ProcessingUnit` from the provided config""" + unit = cls.by_name(unit_config.unit_name) + return unit(unit_config, **shared) + + @classmethod + def load_from_path(cls, unit_path, unit_name=None, **shared): + """Load a :class:`ProcessingUnit` from a persisted processing unit + directory + + Args: + unit_path (str or :class:`pathlib.Path`): path to the persisted + processing unit + unit_name (str, optional): Name of the processing unit to load. + By default, the unit name is assumed to be stored in a + "metadata.json" file located in the directory at unit_path. + + Raises: + LoadingError: when unit_name is None and no metadata file is found + in the processing unit directory + """ + unit_path = Path(unit_path) + if unit_name is None: + metadata_path = unit_path / "metadata.json" + if not metadata_path.exists(): + raise LoadingError( + "Missing metadata for processing unit at path %s" + % str(unit_path)) + with metadata_path.open(encoding="utf8") as f: + metadata = json.load(f) + unit_name = metadata["unit_name"] + unit = cls.by_name(unit_name) + return unit.from_path(unit_path, **shared) + + @classmethod + def get_config(cls, unit_config): + """Returns the :class:`.ProcessingUnitConfig` corresponding to + *unit_config*""" + if isinstance(unit_config, ProcessingUnitConfig): + return unit_config + elif isinstance(unit_config, dict): + unit_name = unit_config["unit_name"] + processing_unit_type = cls.by_name(unit_name) + return processing_unit_type.config_type.from_dict(unit_config) + elif isinstance(unit_config, (str, bytes)): + unit_name = unit_config + unit_config = {"unit_name": unit_name} + processing_unit_type = cls.by_name(unit_name) + return processing_unit_type.config_type.from_dict(unit_config) + else: + raise ValueError( + "Expected `unit_config` to be an instance of " + "ProcessingUnitConfig or dict or str but found: %s" + % type(unit_config)) + + @abstractproperty + def fitted(self): + """Whether or not the processing unit has already been trained""" + pass + + def load_resources_if_needed(self, language): + if self.resources is None or self.fitted: + required_resources = None + if self.config is not None: + required_resources = self.config.get_required_resources() + self.resources = load_resources(language, required_resources) + + def fit_builtin_entity_parser_if_needed(self, dataset): + # We only fit a builtin entity parser when the unit has already been + # fitted or if 
the parser is none. + # In the other cases the parser is provided fitted by another unit. + if self.builtin_entity_parser is None or self.fitted: + self.builtin_entity_parser = BuiltinEntityParser.build( + dataset=dataset) + return self + + def fit_custom_entity_parser_if_needed(self, dataset): + # We only fit a custom entity parser when the unit has already been + # fitted or if the parser is none. + # In the other cases the parser is provided fitted by another unit. + required_resources = self.config.get_required_resources() + if not required_resources or not required_resources.get( + CUSTOM_ENTITY_PARSER_USAGE): + # In these cases we need a custom entity parser only to do the + # final slot resolution step, which must be done without stemming. + parser_usage = CustomEntityParserUsage.WITHOUT_STEMS + else: + parser_usage = required_resources[CUSTOM_ENTITY_PARSER_USAGE] + + if self.custom_entity_parser is None or self.fitted: + self.load_resources_if_needed(dataset[LANGUAGE]) + self.custom_entity_parser = CustomEntityParser.build( + dataset, parser_usage, self.resources) + return self + + def persist_metadata(self, path, **kwargs): + metadata = {"unit_name": self.unit_name} + metadata.update(kwargs) + metadata_json = json_string(metadata) + with (path / "metadata.json").open(mode="w", encoding="utf8") as f: + f.write(metadata_json) + + # @abstractmethod + def persist(self, path): + pass + + @abstractclassmethod + def from_path(cls, path, **shared): + pass |
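
The snippet below is a minimal usage sketch, not part of the commit above: it composes an NLUEngineConfig from the parser configs introduced here, round-trips it through its dict form, and queries the merged resource requirements. It assumes snips_inference_agl is importable and that the parser units are registered on import (as the Registrable machinery above suggests); the particular parser line-up is illustrative only.

# Sketch: composing and serializing the pipeline configs added above.
from snips_inference_agl.pipeline.configs import (
    DeterministicIntentParserConfig, NLUEngineConfig,
    ProbabilisticIntentParserConfig)

# Parsers are tried in list order at inference time.
config = NLUEngineConfig(intent_parsers_configs=[
    DeterministicIntentParserConfig(ignore_stop_words=True),
    ProbabilisticIntentParserConfig(),
])

# Configs round-trip through plain dicts (FromDict.from_dict / to_dict),
# which is how they end up stored in a persisted engine's JSON files.
config_dict = config.to_dict()
restored = NLUEngineConfig.from_dict(config_dict)
# Expected to hold, since FromDict filters non-constructor keys such as
# "unit_name" and to_dict re-adds them.
assert restored.to_dict() == config_dict

# Each config advertises the resources its unit needs; the engine config
# merges them recursively over all of its sub-configs.
print(config.get_required_resources())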