Merge pull request #350 from deeppavlov/dev
Release v0.17.0
dilyararimovna authored Mar 18, 2023
2 parents 7015110 + 69defbf commit aa44906
Showing 295 changed files with 11,671 additions and 8,496 deletions.
31 changes: 16 additions & 15 deletions README.md
@@ -260,21 +260,22 @@ Dream Architecture is presented in the following image:
| Wiki Facts | 1.7 GB RAM | model that extracts related facts from Wikipedia and WikiHow pages |

## Services
| Name                   | Requirements            | Description |
|------------------------|-------------------------|-------------|
| DialoGPT               | 1.2 GB RAM, 2.1 GB GPU  | generative service based on a Transformers generative model; the model is set in the docker compose argument `PRETRAINED_MODEL_NAME_OR_PATH` (for example, `microsoft/DialoGPT-small` with 0.2-0.5 sec on GPU) |
| DialoGPT Persona-based | 1.2 GB RAM, 2.1 GB GPU  | generative service based on a Transformers generative model; the model was pre-trained on the PersonaChat dataset to generate a response conditioned on several sentences of the socialbot's persona |
| Image Captioning       | 4 GB RAM, 5.4 GB GPU    | creates a text representation of a received image |
| Infilling              | 1 GB RAM, 1.2 GB GPU    | (turned off but the code is available) generative service based on an Infilling model; for a given utterance, returns the utterance with `_` in the original text replaced by generated tokens |
| Knowledge Grounding    | 2 GB RAM, 2.1 GB GPU    | generative service based on the BlenderBot architecture that provides a response to the context, taking into account an additional text paragraph |
| Masked LM              | 1.1 GB RAM, 1 GB GPU    | (turned off but the code is available) |
| Seq2seq Persona-based  | 1.5 GB RAM, 1.5 GB GPU  | generative service based on a Transformers seq2seq model; the model was pre-trained on the PersonaChat dataset to generate a response conditioned on several sentences of the socialbot's persona |
| Sentence Ranker        | 1.2 GB RAM, 2.1 GB GPU  | ranking model set via `PRETRAINED_MODEL_NAME_OR_PATH` which, for a pair of sentences, returns a float correspondence score |
| StoryGPT               | 2.6 GB RAM, 2.15 GB GPU | generative service based on a fine-tuned GPT-2; for a given set of keywords, returns a short story using those keywords |
| GPT-3.5                | 100 MB RAM              | generative service based on the OpenAI API; the model is set in the docker compose argument `PRETRAINED_MODEL_NAME_OR_PATH` (in particular, this service uses `text-davinci-003`) |
| ChatGPT                | 100 MB RAM              | generative service based on the OpenAI API; the model is set in the docker compose argument `PRETRAINED_MODEL_NAME_OR_PATH` (in particular, this service uses `gpt-3.5-turbo`) |
| Prompt StoryGPT        | 3 GB RAM, 4 GB GPU      | generative service based on a fine-tuned GPT-2; for a given topic represented by one noun, returns a short story on that topic |
| GPT-J 6B               | 1.5 GB RAM, 24.2 GB GPU | generative service based on a Transformers generative model; the model is set in the docker compose argument `PRETRAINED_MODEL_NAME_OR_PATH` (in particular, this service uses the [GPT-J model](https://huggingface.co/EleutherAI/gpt-j-6B)) |
| BLOOMZ 7B              | 2.5 GB RAM, 29 GB GPU   | generative service based on a Transformers generative model; the model is set in the docker compose argument `PRETRAINED_MODEL_NAME_OR_PATH` (in particular, this service uses the [BLOOMZ-7b1 model](https://huggingface.co/bigscience/bloomz-7b1)) |
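
Most of the Transformers-based services in this table are parameterized by the `PRETRAINED_MODEL_NAME_OR_PATH` docker compose argument. The sketch below illustrates, under stated assumptions, how such a service might load and query the configured model; the environment-variable reuse and the generation settings are illustrative, not the services' actual code.

```python
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the build argument is also exposed to the service as an env variable.
MODEL_NAME = os.getenv("PRETRAINED_MODEL_NAME_OR_PATH", "microsoft/DialoGPT-small")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


def generate_reply(context: str) -> str:
    # Encode the dialog context and generate a continuation of up to 40 new tokens.
    input_ids = tokenizer.encode(context + tokenizer.eos_token, return_tensors="pt")
    output_ids = model.generate(
        input_ids, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id
    )
    # Strip the prompt and decode only the newly generated tokens.
    return tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)


print(generate_reply("Hi! How are you today?"))
```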

## Skills
| Name | Requirements | Description |
23 changes: 23 additions & 0 deletions annotators/BadlistedWordsDetector/component.yml
@@ -0,0 +1,23 @@
badlisted-words:
  name: badlisted_words
  display_name: Bad Words Detection
  container_name: badlisted-words
  component_type: null
  model_type: Dictionary/Pattern-based
  is_customizable: false
  author: DeepPavlov
  description: detects words and phrases from the badlist
  ram_usage: 150M
  gpu_usage: null
  port: 8018
  endpoints:
  - group: annotators
    endpoint: badlisted_words
  - group: candidate_annotators
    endpoint: badlisted_words_batch
  - group: services
    endpoint: badlisted_words_batch
  build_args:
    SERVICE_PORT: 8018
    SERVICE_NAME: badlisted_words
  date_created: '2023-03-16T09:45:32'
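
The component above is a plain HTTP service on port 8018. Below is a minimal sketch of calling its `badlisted_words` endpoint from inside the docker network; the payload shape is an assumption (a batch of sentences seems plausible given the dialog formatter), not the confirmed contract.

```python
import requests

# Assumed request shape: a batch of user sentences to check against the badlist.
payload = {"sentences": ["hello there", "some potentially bad phrase"]}

response = requests.post(
    "http://badlisted-words:8018/badlisted_words",  # container_name and port from component.yml
    json=payload,
    timeout=1.0,  # matches the connector timeout declared in pipeline.yml
)
response.raise_for_status()
print(response.json())  # presumably per-sentence flags for badlisted words
```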
21 changes: 21 additions & 0 deletions annotators/BadlistedWordsDetector/pipeline.yml
@@ -0,0 +1,21 @@
badlisted-words:
- group: annotators
  connector:
    protocol: http
    timeout: 1.0
    url: http://badlisted-words:8018/badlisted_words
  dialog_formatter: state_formatters.dp_formatters:preproc_last_human_utt_dialog
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - annotators.spelling_preprocessing
  state_manager_method: add_annotation
- group: candidate_annotators
  connector:
    protocol: http
    timeout: 1.0
    url: http://badlisted-words:8018/badlisted_words_batch
  dialog_formatter: state_formatters.dp_formatters:hypotheses_list
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - skills
  state_manager_method: add_hypothesis_annotation_batch
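
Each entry in this pipeline file pairs an HTTP connector with a dialog formatter (which builds the request payload from dialog state) and a response formatter (which normalizes the reply before the given `state_manager_method` stores it). The sketch below shows that flow in simplified form; it is an assumption about the agent's behavior, not its actual dispatch code.

```python
import importlib

import requests


def resolve(spec: str):
    # Resolve "package.module:function" strings such as those used in pipeline.yml.
    module_name, func_name = spec.split(":")
    return getattr(importlib.import_module(module_name), func_name)


def call_service(entry: dict, dialog: dict):
    # Build the payload from dialog state, call the service, normalize the reply.
    build_payload = resolve(entry["dialog_formatter"])
    format_reply = resolve(entry["response_formatter"])
    connector = entry["connector"]
    raw = requests.post(
        connector["url"], json=build_payload(dialog), timeout=connector["timeout"]
    ).json()
    return format_reply(raw)
```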
23 changes: 23 additions & 0 deletions annotators/BadlistedWordsDetector_ru/component.yml
@@ -0,0 +1,23 @@
badlisted-words-ru:
  name: badlisted_words
  display_name: Bad Words Detection
  container_name: badlisted-words-ru
  component_type: null
  model_type: Dictionary/Pattern-based
  is_customizable: false
  author: DeepPavlov
  description: detects obscene Russian words from the badlist
  ram_usage: 50M
  gpu_usage: null
  port: 8018
  endpoints:
  - group: annotators
    endpoint: badlisted_words
  - group: candidate_annotators
    endpoint: badlisted_words_batch
  - group: services
    endpoint: badlisted_words_batch
  build_args:
    SERVICE_PORT: 8018
    SERVICE_NAME: badlisted_words
  date_created: '2023-03-16T09:45:32'
12 changes: 12 additions & 0 deletions annotators/BadlistedWordsDetector_ru/pipeline.yml
@@ -0,0 +1,12 @@
badlisted-words-ru:
- group: annotators
  connector:
    protocol: http
    timeout: 1.0
    url: http://badlisted-words-ru:8018/badlisted_words
  dialog_formatter: state_formatters.dp_formatters:preproc_and_tokenized_last_human_utt_dialog
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - annotators.spelling_preprocessing
  - annotators.spacy_annotator
  state_manager_method: add_annotation
50 changes: 50 additions & 0 deletions annotators/COMeT/component.yml
@@ -0,0 +1,50 @@
comet-atomic:
  name: comet_atomic
  display_name: COMeT Atomic
  container_name: comet-atomic
  component_type: null
  model_type: NN-based
  is_customizable: false
  author: DeepPavlov
  description: Commonsense prediction models COMeT Atomic
  ram_usage: 2G
  gpu_usage: 1.1G
  port: 8053
  endpoints:
  - group: services
    endpoint: comet
  build_args:
    GRAPH: atomic
    SERVICE_HOME: ./annotators/COMeT
    SERVICE_NAME: comet_atomic
    SERVICE_PORT: 8053
    PRETRAINED_MODEL: http://lnsigo.mipt.ru/export/alexaprize_data/comet/atomic_pretrained_model.pickle
    PREPROCESS_DATA: http://lnsigo.mipt.ru/export/alexaprize_data/comet/categories_oEffect%23oReact%23oWant%23xAttr%23xEffect%23xIntent%23xNeed%23xReact%23xWant-maxe1_17-maxe2_35-maxr_1.pickle
    DECODING_ALGO: beam-3
  date_created: '2023-03-16T09:45:32'
comet-conceptnet:
  name: comet_conceptnet
  display_name: COMeT ConceptNet
  container_name: comet-conceptnet
  component_type: null
  model_type: NN-based
  is_customizable: false
  author: DeepPavlov
  description: Commonsense prediction models COMeT ConceptNet
  ram_usage: 2G
  gpu_usage: 1.1G
  port: 8065
  endpoints:
  - group: annotators
    endpoint: comet_annotator
  - group: services
    endpoint: comet
  build_args:
    GRAPH: conceptnet
    SERVICE_HOME: ./annotators/COMeT/
    SERVICE_NAME: comet_conceptnet
    SERVICE_PORT: 8065
    PRETRAINED_MODEL: http://lnsigo.mipt.ru/export/alexaprize_data/conceptnet/conceptnet_pretrained_model.pickle
    PREPROCESS_DATA: http://lnsigo.mipt.ru/export/alexaprize_data/conceptnet/rel_language-trainsize_100-devversion_12-maxe1_10-maxe2_15-maxr_5.pickle
    DECODING_ALGO: beam-3
  date_created: '2023-03-16T09:45:32'
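
Both COMeT containers expose a `comet` endpoint for commonsense completion; the ATOMIC relation categories (`xIntent`, `xNeed`, `xReact`, etc.) are visible in the `PREPROCESS_DATA` file name above. A hedged request sketch follows; the payload keys are assumptions, not a documented contract.

```python
import requests

# Assumed payload: an input event plus the ATOMIC relation categories to complete.
payload = {
    "input": "PersonX went to a restaurant",
    "category": ["xIntent", "xNeed", "xReact"],
}

resp = requests.post(
    "http://comet-atomic:8053/comet",  # container_name and port from component.yml
    json=payload,
    timeout=2.0,
)
print(resp.json())  # presumably beam-search completions for each requested relation
```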
13 changes: 13 additions & 0 deletions annotators/COMeT/pipeline.yml
@@ -0,0 +1,13 @@
comet-conceptnet:
- group: annotators
  connector:
    protocol: http
    timeout: 2.0
    url: http://comet-conceptnet:8065/comet_annotator
  dialog_formatter: state_formatters.dp_formatters:last_human_utt_nounphrases
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - annotators.spelling_preprocessing
  - annotators.entity_detection
  - annotators.spacy_nounphrases
  state_manager_method: add_annotation
23 changes: 23 additions & 0 deletions annotators/ConversationEvaluator/component.yml
@@ -0,0 +1,23 @@
convers-evaluator-annotator:
  name: convers_evaluator_annotator
  display_name: Conversation Evaluator
  container_name: convers-evaluator-annotator
  component_type: null
  model_type: NN-based
  is_customizable: false
  author: DeepPavlov
  description: is trained on the Alexa Prize data from the previous competitions and
    predicts whether the candidate response is interesting, comprehensible, on-topic,
    engaging, or erroneous
  ram_usage: 1G
  gpu_usage: 4.5G
  port: 8004
  endpoints:
  - group: candidate_annotators
    endpoint: batch_model
  build_args:
    CONFIG: conveval.json
    SERVICE_PORT: 8004
    DATA_URL: https://files.deeppavlov.ai/alexaprize_data/cobot_conveval2.tar.gz
    SERVICE_NAME: convers_evaluator_annotator
  date_created: '2023-03-16T09:45:32'
11 changes: 11 additions & 0 deletions annotators/ConversationEvaluator/pipeline.yml
@@ -0,0 +1,11 @@
convers-evaluator-annotator:
- group: candidate_annotators
  connector:
    protocol: http
    timeout: 2.0
    url: http://convers-evaluator-annotator:8004/batch_model
  dialog_formatter: state_formatters.dp_formatters:convers_evaluator_annotator_formatter
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - skills
  state_manager_method: add_hypothesis_annotation_batch
44 changes: 44 additions & 0 deletions annotators/IntentCatcherTransformers/component.yml
@@ -0,0 +1,44 @@
intent-catcher:
  name: intent_catcher
  display_name: Intent Catcher
  container_name: intent-catcher
  component_type: null
  model_type: NN-based
  is_customizable: true
  author: DeepPavlov
  description: classifies user utterances into a number of predefined intents which
    are trained on a set of phrases and regexps
  ram_usage: 1.7G
  gpu_usage: 2.4G
  port: 8014
  endpoints:
  - group: annotators
    endpoint: detect
  build_args:
    SERVICE_PORT: 8014
    CONFIG_NAME: intents_model_dp_config.json
    INTENT_PHRASES_PATH: intent_phrases.json
    SERVICE_NAME: intent_catcher
  date_created: '2023-03-16T09:45:32'
intent-catcher-ru:
  name: intent_catcher
  display_name: Intent Catcher
  container_name: intent-catcher-ru
  component_type: null
  model_type: NN-based
  is_customizable: true
  author: DeepPavlov
  description: classifies user utterances into a number of predefined intents which
    are trained on a set of phrases and regexps
  ram_usage: 900M
  gpu_usage: null
  port: 8014
  endpoints:
  - group: annotators
    endpoint: detect
  build_args:
    SERVICE_PORT: 8014
    CONFIG_NAME: intents_model_dp_config_RU.json
    INTENT_PHRASES_PATH: intent_phrases_RU.json
    SERVICE_NAME: intent_catcher
  date_created: '2023-03-16T09:45:32'
24 changes: 24 additions & 0 deletions annotators/IntentCatcherTransformers/pipeline.yml
@@ -0,0 +1,24 @@
intent-catcher:
- group: annotators
  connector:
    protocol: http
    timeout: 1.0
    url: http://intent-catcher:8014/detect
  dialog_formatter: state_formatters.dp_formatters:last_utt_sentseg_segments_dialog
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - annotators.spelling_preprocessing
  - annotators.sentseg
  state_manager_method: add_annotation
intent-catcher-ru:
- group: annotators
  connector:
    protocol: http
    timeout: 1.0
    url: http://intent-catcher-ru:8014/detect
  dialog_formatter: state_formatters.dp_formatters:last_utt_sentseg_segments_dialog
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - annotators.spelling_preprocessing
  - annotators.sentseg
  state_manager_method: add_annotation
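
Intent Catcher is one of the customizable components above (`is_customizable: true`); its training phrases and regexps come from `intent_phrases.json` (or `intent_phrases_RU.json`). A hedged sketch of calling its `detect` endpoint follows; the payload shape (a batch of sentence-segment lists, suggested by the `last_utt_sentseg_segments_dialog` formatter) is an assumption, not the confirmed contract.

```python
import requests

# Assumed payload: one list of sentence segments per dialog in the batch.
payload = {"sentences": [["what is your name", "tell me a joke"]]}

resp = requests.post(
    "http://intent-catcher:8014/detect",  # container_name and port from component.yml
    json=payload,
    timeout=1.0,  # matches the connector timeout above
)
print(resp.json())  # presumably detected intents with confidence scores per utterance
```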
22 changes: 22 additions & 0 deletions annotators/MultilingualSentimentClassification/component.yml
@@ -0,0 +1,22 @@
sentiment-classification-multilingual:
  name: sentiment_classification
  display_name: Sentiment Classification
  container_name: sentiment-classification-multilingual
  component_type: null
  model_type: NN-based
  is_customizable: false
  author: DeepPavlov
  description: classifies sentiment into positive, negative, and neutral classes
  ram_usage: 2G
  gpu_usage: 2G
  port: 8024
  endpoints:
  - group: annotators
    endpoint: respond
  - group: candidate_annotators
    endpoint: respond_batch
  build_args:
    SERVICE_PORT: 8024
    SERVICE_NAME: sentiment_classification
    PRETRAINED_MODEL_NAME_OR_PATH: cardiffnlp/twitter-xlm-roberta-base-sentiment
  date_created: '2023-03-16T09:45:32'
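
This annotator wraps the public `cardiffnlp/twitter-xlm-roberta-base-sentiment` checkpoint named in `PRETRAINED_MODEL_NAME_OR_PATH`. Outside the service, the same model can be sanity-checked directly with the Transformers `pipeline` helper; the snippet below is a standalone sketch, not the annotator's serving code.

```python
from transformers import pipeline

# Load the same multilingual sentiment checkpoint the annotator is configured with.
classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
)

# The model is multilingual; outputs map to the positive/neutral/negative classes
# mentioned in the component description.
print(classifier(["I love this movie!", "Это было ужасно."]))  # second input: "That was terrible."
```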
19 changes: 19 additions & 0 deletions annotators/MultilingualSentimentClassification/pipeline.yml
@@ -0,0 +1,19 @@
sentiment-classification-multilingual:
- group: annotators
  connector:
    protocol: http
    timeout: 2.0
    url: http://sentiment-classification-multilingual:8024/respond
  dialog_formatter: state_formatters.dp_formatters:preproc_last_human_utt_dialog
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  state_manager_method: add_annotation
- group: candidate_annotators
  connector:
    protocol: http
    timeout: 2.0
    url: http://sentiment-classification-multilingual:8024/respond_batch
  dialog_formatter: state_formatters.dp_formatters:hypotheses_list
  response_formatter: state_formatters.dp_formatters:simple_formatter_service
  previous_services:
  - skills
  state_manager_method: add_hypothesis_annotation_batch
