diff options
Diffstat (limited to 'snips_inference_agl/default_configs')
-rw-r--r-- | snips_inference_agl/default_configs/__init__.py | 26 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_de.py | 159 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_en.py | 145 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_es.py | 138 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_fr.py | 137 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_it.py | 137 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_ja.py | 164 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_ko.py | 155 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_pt_br.py | 137 | ||||
-rw-r--r-- | snips_inference_agl/default_configs/config_pt_pt.py | 137 |
10 files changed, 1335 insertions, 0 deletions
diff --git a/snips_inference_agl/default_configs/__init__.py b/snips_inference_agl/default_configs/__init__.py new file mode 100644 index 0000000..fc66d33 --- /dev/null +++ b/snips_inference_agl/default_configs/__init__.py @@ -0,0 +1,26 @@ +from __future__ import unicode_literals + +from snips_inference_agl.constants import ( + LANGUAGE_DE, LANGUAGE_EN, LANGUAGE_ES, LANGUAGE_FR, LANGUAGE_IT, + LANGUAGE_JA, LANGUAGE_KO, LANGUAGE_PT_BR, LANGUAGE_PT_PT) +from .config_de import CONFIG as CONFIG_DE +from .config_en import CONFIG as CONFIG_EN +from .config_es import CONFIG as CONFIG_ES +from .config_fr import CONFIG as CONFIG_FR +from .config_it import CONFIG as CONFIG_IT +from .config_ja import CONFIG as CONFIG_JA +from .config_ko import CONFIG as CONFIG_KO +from .config_pt_br import CONFIG as CONFIG_PT_BR +from .config_pt_pt import CONFIG as CONFIG_PT_PT + +DEFAULT_CONFIGS = { + LANGUAGE_DE: CONFIG_DE, + LANGUAGE_EN: CONFIG_EN, + LANGUAGE_ES: CONFIG_ES, + LANGUAGE_FR: CONFIG_FR, + LANGUAGE_IT: CONFIG_IT, + LANGUAGE_JA: CONFIG_JA, + LANGUAGE_KO: CONFIG_KO, + LANGUAGE_PT_BR: CONFIG_PT_BR, + LANGUAGE_PT_PT: CONFIG_PT_PT, +} diff --git a/snips_inference_agl/default_configs/config_de.py b/snips_inference_agl/default_configs/config_de.py new file mode 100644 index 0000000..200fc30 --- /dev/null +++ b/snips_inference_agl/default_configs/config_de.py @@ -0,0 +1,159 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_200000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_200000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": { + "prefix_size": 2 + }, + "factory_name": "prefix", + "offsets": [0] + }, + { + "args": {"prefix_size": 5}, + "factory_name": "prefix", + "offsets": [0] + }, + { + "args": {"suffix_size": 2}, + "factory_name": "suffix", + "offsets": [0] + }, + { + "args": {"suffix_size": 5}, + "factory_name": "suffix", + "offsets": [0] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + } + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_en.py b/snips_inference_agl/default_configs/config_en.py new file mode 100644 index 0000000..12f7ae1 --- /dev/null +++ b/snips_inference_agl/default_configs/config_en.py @@ -0,0 +1,145 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "cluster_name": "brown_clusters", + "use_stemming": False + }, + "factory_name": "word_cluster", + "offsets": [-2, -1, 0, 1] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + } + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": False, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_es.py b/snips_inference_agl/default_configs/config_es.py new file mode 100644 index 0000000..28969ce --- /dev/null +++ b/snips_inference_agl/default_configs/config_es.py @@ -0,0 +1,138 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_fr.py b/snips_inference_agl/default_configs/config_fr.py new file mode 100644 index 0000000..a2da590 --- /dev/null +++ b/snips_inference_agl/default_configs/config_fr.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + } + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_it.py b/snips_inference_agl/default_configs/config_it.py new file mode 100644 index 0000000..a2da590 --- /dev/null +++ b/snips_inference_agl/default_configs/config_it.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + } + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_ja.py b/snips_inference_agl/default_configs/config_ja.py new file mode 100644 index 0000000..b28791f --- /dev/null +++ b/snips_inference_agl/default_configs/config_ja.py @@ -0,0 +1,164 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": False + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": None, + "use_stemming": False, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": None, + "use_stemming": False, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 0, 1, 2] + }, + { + "args": {"prefix_size": 1}, + "factory_name": "prefix", + "offsets": [0, 1] + }, + { + "args": {"prefix_size": 2}, + "factory_name": "prefix", + "offsets": [0, 1] + }, + { + "args": {"suffix_size": 1}, + "factory_name": "suffix", + "offsets": [0, 1] + }, + { + "args": {"suffix_size": 2}, + "factory_name": "suffix", + "offsets": [0, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": False, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-1, 0, 1, 2], + }, + { + "args": { + "use_stemming": False, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-1, 0, 1, 2], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-1, 0, 1, 2] + }, + { + "args": { + "cluster_name": "w2v_clusters", + "use_stemming": False + }, + "factory_name": "word_cluster", + "offsets": [-2, -1, 0, 1, 2] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.9, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": False, + "word_clusters_name": "w2v_clusters" + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_ko.py b/snips_inference_agl/default_configs/config_ko.py new file mode 100644 index 0000000..1630796 --- /dev/null +++ b/snips_inference_agl/default_configs/config_ko.py @@ -0,0 +1,155 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": False + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": None, + "use_stemming": False, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": None, + "use_stemming": False, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {"prefix_size": 1}, + "factory_name": "prefix", + "offsets": [0] + }, + { + "args": {"prefix_size": 2}, + "factory_name": "prefix", + "offsets": [0] + }, + { + "args": {"suffix_size": 1}, + "factory_name": "suffix", + "offsets": [0] + }, + { + "args": {"suffix_size": 2}, + "factory_name": "suffix", + "offsets": [0] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": False, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": False, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + } + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": False, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "noise_reweight_factor": 1, + } + } + ] +} diff --git a/snips_inference_agl/default_configs/config_pt_br.py b/snips_inference_agl/default_configs/config_pt_br.py new file mode 100644 index 0000000..450f0db --- /dev/null +++ b/snips_inference_agl/default_configs/config_pt_br.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_5000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_5000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + }, + "noise_reweight_factor": 1, + } + ] +} diff --git a/snips_inference_agl/default_configs/config_pt_pt.py b/snips_inference_agl/default_configs/config_pt_pt.py new file mode 100644 index 0000000..450f0db --- /dev/null +++ b/snips_inference_agl/default_configs/config_pt_pt.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "lookup_intent_parser", + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_5000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [-2, -1, 0, 1, 2] + }, + { + "args": { + "common_words_gazetteer_name": + "top_5000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [-2, 1] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [-1, 0, 1] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [-2, -1, 0] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [0, 1, 2] + }, + { + "args": {"n": 1}, + "factory_name": "shape_ngram", + "offsets": [0] + }, + { + "args": {"n": 2}, + "factory_name": "shape_ngram", + "offsets": [-1, 0] + }, + { + "args": {"n": 3}, + "factory_name": "shape_ngram", + "offsets": [-1] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": False + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2, + "entity_filter": { + "automatically_extensible": True + } + }, + "factory_name": "entity_match", + "offsets": [-2, -1, 0], + "drop_out": 0.5 + }, + { + "args": {"tagging_scheme_code": 1}, + "factory_name": "builtin_entity_match", + "offsets": [-2, -1, 0] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + }, + "noise_reweight_factor": 1, + } + ] +} |