From 79c1142b238ae56ce8d497bae039175bc66b1344 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Mon, 14 Nov 2022 14:00:34 +0300 Subject: [PATCH 1/5] feat: upd ru dialogpt (#212) * feat: up ru dialogpt * fix: load in dockerfile * fix: newer version of transformers * fix: fix version of wiki-facts * fix: fixe dp version * fix: change spacy version * feat: wiki facts requirements * feat: upd ru dialogpt * feat: upd ru dialogpt * feat: upd dialogrpt version * fix: chose gpu * fix: chose gpu * feat: dialogpt * feat: dialogpt * feat: dialogpt * fix: generative skill timeout 4 sec * fix: generative skill timeout 4 sec * fix: generative skill tests * fix: codestyle * fix: tests * fix: codestyle * fix: codestyle --- README.md | 28 +- README_ru.md | 20 +- .../dream_russian/docker-compose.override.yml | 4 +- .../dream_russian/pipeline_conf.json | 2 +- assistant_dists/dream_russian/test.yml | 4 +- services/dialogpt_RU/Dockerfile | 3 + services/dialogpt_RU/requirements.txt | 2 +- services/dialogpt_RU/server.py | 227 +++++----- services/dialogpt_RU/test.py | 2 +- services/dialogrpt_ru/Dockerfile | 3 + services/dialogrpt_ru/feeder.py | 2 +- services/dialogrpt_ru/requirements.txt | 2 +- services/dialogrpt_ru/utils.py | 4 +- services/wiki_facts/Dockerfile | 4 +- services/wiki_facts/requirements.txt | 8 +- .../dff_generative_skill/scenario/response.py | 2 +- skills/dff_generative_skill/server.py | 2 +- skills/dff_generative_skill/test_server.py | 10 +- .../tests/lets_talk_in.json | 405 +++++++++++++++++- .../tests/lets_talk_out.json | 341 ++++++++++++++- 20 files changed, 894 insertions(+), 181 deletions(-) diff --git a/README.md b/README.md index 9857255393..087c3db3be 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ This is a generative-based socialbot that uses [English DialoGPT model](https:// ### Dream Russian -Russian version of DeepPavlov Dream Socialbot. This is a generative-based socialbot that uses [Russian DialoGPT model](https://huggingface.co/Grossmend/rudialogpt3_medium_based_on_gpt2) to generate most of the responses. It also contains intent catcher and responder components to cover special user requests. +Russian version of DeepPavlov Dream Socialbot. This is a generative-based socialbot that uses [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2) to generate most of the responses. It also contains intent catcher and responder components to cover special user requests. [Link to the distribution.](https://github.com/deeppavlov/dream/tree/main/assistant_dists/dream_russian) # Quick Start @@ -301,23 +301,23 @@ Dream Architecture is presented in the following image: ## Annotators -| Name | Requirements | Description | -|------------------------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Badlisted words | 50 MiB RAM | detects obscene Russian words from the badlist | -| Entity detection | 3 GiB RAM | extracts entities and their types from utterances | -| Entity linking | 500 MiB RAM, ?? 
GiB GPU | finds Wikidata entity ids for the entities detected with Entity Detection | -| Intent catcher | 900 MiB RAM | classifies user utterances into a number of predefined intents which are trained on a set of phrases and regexps | -| NER | 1.7 GiB RAM, 4.9 Gib GPU | extracts person names, names of locations, organizations from uncased text using ruBert-based (pyTorch) model | -| Sentseg | 2.4 GiB RAM, 4.9 Gib GPU | recovers punctuation using ruBert-based (pyTorch) model and splits into sentences | -| Spacy Annotator | 250 MiB RAM | token-wise annotations by Spacy | -| Spelling preprocessing | 4.4 GiB RAM | Russian Levenshtein correction model | -| Wiki parser | 100 MiB RAM | extracts Wikidata triplets for the entities detected with Entity Linking | -| DialogRPT | 3.8 GiB RAM, 2 GiB GPU | DialogRPT model which is based on Russian DialoGPT (see https://huggingface.co/Grossmend/rudialogpt3_medium_based_on_gpt2) and fine-tuned on Russian Pikabu Comment sequences | +| Name | Requirements | Description | +|------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Badlisted words | 50 MiB RAM | detects obscene Russian words from the badlist | +| Entity detection | 3 GiB RAM | extracts entities and their types from utterances | +| Entity linking | 500 MiB RAM, ?? GiB GPU | finds Wikidata entity ids for the entities detected with Entity Detection | +| Intent catcher | 900 MiB RAM | classifies user utterances into a number of predefined intents which are trained on a set of phrases and regexps | +| NER | 1.7 GiB RAM, 4.9 Gib GPU | extracts person names, names of locations, organizations from uncased text using ruBert-based (pyTorch) model | +| Sentseg | 2.4 GiB RAM, 4.9 Gib GPU | recovers punctuation using ruBert-based (pyTorch) model and splits into sentences | +| Spacy Annotator | 250 MiB RAM | token-wise annotations by Spacy | +| Spelling preprocessing | 4.4 GiB RAM | Russian Levenshtein correction model | +| Wiki parser | 100 MiB RAM | extracts Wikidata triplets for the entities detected with Entity Linking | +| DialogRPT | 3.8 GiB RAM, 2 GiB GPU | DialogRPT model which is based on [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2) and fine-tuned on Russian Pikabu Comment sequences | ## Skills & Services | Name | Requirements | Description | |------------------------|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| DialoGPT | 2.8 GiB RAM, 2 GiB GPU | Russian DialoGPT model https://huggingface.co/Grossmend/rudialogpt3_medium_based_on_gpt2 | +| DialoGPT | 2.8 GiB RAM, 2 GiB GPU | [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2) | | Dummy Skill | a part of agent container | a fallback skill with multiple non-toxic candidate responses and random Russian questions | | Personal Info skill | 40 MiB RAM | queries and stores user's name, birthplace, and location | | DFF Generative skill | 50 MiB RAM | **[New DFF version]** generative skill which uses DialoGPT service to generate 3 different hypotheses | diff --git a/README_ru.md b/README_ru.md index d403a43238..c6611f7839 100644 --- a/README_ru.md +++ b/README_ru.md @@ -59,7 +59,7 @@ Deepy GoBot Base содержит аннотатор исправления оп ### Dream 
Russian

Русскоязычная версия DeepPavlov Dream Socialbot. Данная версия основана на нейросетевой генерации с использованием
-[Russian DialoGPT модели](https://huggingface.co/Grossmend/rudialogpt3_medium_based_on_gpt2).
+[Russian DialoGPT модели от DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2).
Дистрибутив также содержит компоненты для детектирования запросов пользователя и выдачи специальных
ответов на них.

[Link to the distribution.](https://github.com/deeppavlov/dream/tree/main/assistant_dists/dream_russian)

@@ -199,15 +199,15 @@ docker-compose -f docker-compose.yml -f assistant_dists/dream/docker-compose.ove
| DialogRPT            | 3.9 GiB RAM, 2.2 GiB GPU  | Сервис оценки вероятности реплики понравиться пользователю (updown) на основе ранжирующей модели DialogRPT, которая дообучена на основе генеративной модели Russian DialoGPT на комментариев с сайта Пикабу. |

## Навыки и Сервисы (Skills & Services)
-| Name                 | Requirements              | Description                                                                                                                                                                     |
|----------------------|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| DialoGPT             | 2.8 GiB RAM, 2.2 GiB GPU  | Сервис генерации реплики по текстовому контексту диалога на основе предобученной модели Russian [DialoGPT](https://huggingface.co/Grossmend/rudialogpt3_medium_based_on_gpt2)   |
| Dummy Skill          | a part of agent container | Навык для генерации ответов-заглушек и выдачис лучайных вопросов из базы в каечстве linking-questions.                                                                          |
| Personal Info Skill  | 40 MiB RAM                | Сценарный навык для извлечения и запоминания основной личной информации о пользователе.                                                                                         |
| DFF Generative Skill | 50 MiB RAM                | **[New DFF version]** навык, выдающий 5 гипотез, выданных сервисом DialoGPT                                                                                                     |
| DFF Intent Responder | 50 MiB RAM                | **[New DFF version]** Сценарный навык на основе DFF для ответа на специальные намерения пользователя.                                                                           |
| DFF Program Y Skill  | 80 MiB RAM                | **[New DFF version]** Сценарный навык на основе DFF для ответа на общие вопросы в виде AIML компоненты.                                                                         |
| DFF Friendship Skill | 70 MiB RAM                | **[New DFF version]** Сценарный навык на основе DFF приветственной части диалога с пользователем.                                                                               |
+| Name                 | Requirements              | Description                                                                                                                                                                                       |
|----------------------|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| DialoGPT             | 2.8 GiB RAM, 2.2 GiB GPU  | Сервис генерации реплики по текстовому контексту диалога на основе предобученной модели [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2)  |
| Dummy Skill          | a part of agent container | Навык для генерации ответов-заглушек и выдачи случайных вопросов из базы в качестве linking-questions.                                                                                            |
| Personal Info Skill  | 40 MiB RAM                | Сценарный навык для извлечения и запоминания основной личной информации о пользователе.                                                                                                           |
| DFF Generative Skill | 50 MiB RAM                | **[New DFF version]** навык, выдающий 5 гипотез, сгенерированных сервисом DialoGPT                                                                                                                |
| DFF Intent Responder | 50 MiB RAM                | **[New DFF version]** Сценарный навык на основе DFF для ответа на специальные намерения пользователя.                                                                                             |
| DFF Program Y Skill  | 80 MiB RAM                | **[New DFF version]** Сценарный навык на основе DFF для ответа на общие вопросы в виде AIML компоненты. 
| +| DFF Friendship Skill | 70 MiB RAM | **[New DFF version]** Сценарный навык на основе DFF приветственной части диалога с пользователем. | # Публикации diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml index 2eaf1bbf13..db893626d2 100644 --- a/assistant_dists/dream_russian/docker-compose.override.yml +++ b/assistant_dists/dream_russian/docker-compose.override.yml @@ -317,7 +317,7 @@ services: context: ./services/dialogpt_RU/ args: SERVICE_PORT: 8091 - PRETRAINED_MODEL_NAME_OR_PATH: "Grossmend/rudialogpt3_medium_based_on_gpt2" + PRETRAINED_MODEL_NAME_OR_PATH: DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2 LANGUAGE: RU command: flask run -h 0.0.0.0 -p 8091 environment: @@ -354,7 +354,7 @@ services: args: SERVICE_PORT: 8122 PRETRAINED_MODEL_FNAME: dialogrpt_ru_ckpt_v0.pth - TOKENIZER_NAME_OR_PATH: "Grossmend/rudialogpt3_medium_based_on_gpt2" + TOKENIZER_NAME_OR_PATH: DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2 command: flask run -h 0.0.0.0 -p 8122 environment: - CUDA_VISIBLE_DEVICES=0 diff --git a/assistant_dists/dream_russian/pipeline_conf.json b/assistant_dists/dream_russian/pipeline_conf.json index 6319cee8df..dea7594f1c 100644 --- a/assistant_dists/dream_russian/pipeline_conf.json +++ b/assistant_dists/dream_russian/pipeline_conf.json @@ -300,7 +300,7 @@ "dff_generative_skill": { "connector": { "protocol": "http", - "timeout": 2, + "timeout": 4, "url": "http://dff-generative-skill:8092/respond" }, "dialog_formatter": "state_formatters.dp_formatters:dff_generative_skill_formatter", diff --git a/assistant_dists/dream_russian/test.yml b/assistant_dists/dream_russian/test.yml index 136107dbbb..40fb1a69dd 100644 --- a/assistant_dists/dream_russian/test.yml +++ b/assistant_dists/dream_russian/test.yml @@ -45,9 +45,9 @@ services: - CUDA_VISIBLE_DEVICES=7 dialogpt: environment: - - CUDA_VISIBLE_DEVICES=7 + - CUDA_VISIBLE_DEVICES=6 dialogrpt: environment: - - CUDA_VISIBLE_DEVICES=7 + - CUDA_VISIBLE_DEVICES=6 dff-template-skill: version: '3.7' diff --git a/services/dialogpt_RU/Dockerfile b/services/dialogpt_RU/Dockerfile index 608116d5d1..e9bf5741d5 100644 --- a/services/dialogpt_RU/Dockerfile +++ b/services/dialogpt_RU/Dockerfile @@ -17,6 +17,9 @@ RUN pip install -r /src/requirements.txt COPY . 
/src +RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${PRETRAINED_MODEL_NAME_OR_PATH}');" +RUN python -c "from transformers import AutoModelForCausalLM; AutoModelForCausalLM.from_pretrained('${PRETRAINED_MODEL_NAME_OR_PATH}');" + HEALTHCHECK --interval=5s --timeout=90s --retries=3 CMD curl --fail 127.0.0.1:${SERVICE_PORT}/healthcheck || exit 1 diff --git a/services/dialogpt_RU/requirements.txt b/services/dialogpt_RU/requirements.txt index d45fabd47e..cbb35a7294 100644 --- a/services/dialogpt_RU/requirements.txt +++ b/services/dialogpt_RU/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.0.1 +transformers==4.11.0 sentencepiece==0.1.94 flask==1.1.1 gunicorn==19.9.0 diff --git a/services/dialogpt_RU/server.py b/services/dialogpt_RU/server.py index c001a19c4f..fc0a70e627 100644 --- a/services/dialogpt_RU/server.py +++ b/services/dialogpt_RU/server.py @@ -1,10 +1,7 @@ -""" -Source code is https://github.com/Grossmend/DialoGPT/blob/master/src/service/service.py -""" import logging import time import os -from typing import Dict, List +import random from transformers import AutoTokenizer, AutoModelForCausalLM import torch @@ -20,7 +17,7 @@ logger = logging.getLogger(__name__) PRETRAINED_MODEL_NAME_OR_PATH = os.environ.get( - "PRETRAINED_MODEL_NAME_OR_PATH", "Grossmend/rudialogpt3_medium_based_on_gpt2" + "PRETRAINED_MODEL_NAME_OR_PATH", "DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2" ) logger.info(f"PRETRAINED_MODEL_NAME_OR_PATH = {PRETRAINED_MODEL_NAME_OR_PATH}") @@ -31,127 +28,165 @@ else: device = "cpu" +try: + tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH) + model = AutoModelForCausalLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH).to(device) + model.eval() + + logger.info("dialogpt model is ready") +except Exception as e: + sentry_sdk.capture_exception(e) + logger.exception(e) + raise e + logger.info(f"dialogpt is set to run on {device}") +SHORT_UTTERANCE_PROBA = 0.7 +CONTEXT_DEPTH = 3 + params_default = { - "max_length": 256, + "max_length": 128, "no_repeat_ngram_size": 3, "do_sample": True, - "top_k": 100, + "top_k": 20, "top_p": 0.9, - "temperature": 0.6, + "temperature": 0.7, "num_return_sequences": 3, "device": device, "is_always_use_length": True, - "length_generate": "1", } -class RussianDialogGPT: - def __init__(self, path_model: str): - self.path_model = path_model - self.tokenizer = None - self.model = None - self._load_model() - - def _load_model(self): - logger.info(f"dialogpt Loading model: {self.path_model} ...") - self.tokenizer = AutoTokenizer.from_pretrained(self.path_model) - self.model = AutoModelForCausalLM.from_pretrained(self.path_model) - - def get_responses(self, inputs: List[Dict], params: Dict) -> List[str]: - - params_ = { - "max_length": params.get("max_length", params_default["max_length"]), - "no_repeat_ngram_size": params.get("no_repeat_ngram_size", params_default["no_repeat_ngram_size"]), - "do_sample": params.get("do_sample", params_default["do_sample"]), - "top_k": params.get("top_k", params_default["top_k"]), - "top_p": params.get("top_p", params_default["top_p"]), - "temperature": params.get("temperature", params_default["temperature"]), - "num_return_sequences": params.get("num_return_sequences", params_default["num_return_sequences"]), - "device": params.get("device", params_default["device"]), - "is_always_use_length": params.get("is_always_use_length", params_default["is_always_use_length"]), - "length_generate": params.get("length_generate", params_default["length_generate"]), - } - - 
inputs_text = "" - for input_ in inputs: - if params_["is_always_use_length"]: - length_rep = len(self.tokenizer.encode(input_["text"])) - if length_rep <= 15: - length_param = "1" - elif length_rep <= 50: - length_param = "2" - elif length_rep <= 256: - length_param = "3" - else: - length_param = "-" - else: - length_param = "-" - inputs_text += f"|{input_['speaker']}|{length_param}|{input_['text']}" - inputs_text += f"|1|{params_['length_generate']}|" - - inputs_token_ids = self.tokenizer.encode(inputs_text, return_tensors="pt") - inputs_token_ids = inputs_token_ids.cuda() if cuda else inputs_token_ids - - try: - outputs_token_ids = self.model.generate( - inputs_token_ids, - max_length=params_["max_length"], - no_repeat_ngram_size=params_["no_repeat_ngram_size"], - do_sample=params_["do_sample"], - top_k=params_["top_k"], - top_p=params_["top_p"], - temperature=params_["temperature"], - num_return_sequences=params_["num_return_sequences"], - device=params_["device"], - mask_token_id=self.tokenizer.mask_token_id, - eos_token_id=self.tokenizer.eos_token_id, - unk_token_id=self.tokenizer.unk_token_id, - pad_token_id=self.tokenizer.pad_token_id, - ) - except Exception as e: - logger.info(f"dialogpt Error generate: {str(e)}") - return "" - - outputs = [self.tokenizer.decode(x, skip_special_tokens=True) for x in outputs_token_ids] - outputs = [x.split("|")[-1] for x in outputs] - # outputs contains list of strings of possible hypotheses - return outputs - +def inputs_by_length(input_: dict, length_rep=None): + if length_rep is None: + length_rep = len(tokenizer.encode(input_["text"])) + if params_default["is_always_use_length"]: + if length_rep <= 15: + length_param = "1" + elif length_rep <= 50: + length_param = "2" + elif length_rep <= 256: + length_param = "3" + else: + length_param = "-" + else: + length_param = "-" + return f"|{input_['speaker']}|{length_param}|{input_['text']}" + + +def format_dialogue_with_target(context, context_lengths, context_depth=3, encode=False, tokenizer=None): + """ + THE LAST UTTERANCE IN THE CONTEXT IS TARGET BOT'S UTTERANCE + + context: List(dict) + context = [ + {"text": "speaker": "human"}, + {"text": "hi there", "speaker": "bot"}, + {"text": "how are you", "speaker": "human"}, + {"text": "great how are u", "speaker": "bot"}, + ] + OR + context = [ + "hi", + "hi there", + "how are you", + "great how are u" + ] + """ + if len(context) > 0 and isinstance(context[0], str): + context_len = len(context) + # the last uttr is from BOT + inputs = [{"text": uttr, "speaker": (context_len - uttr_id) % 2} for uttr_id, uttr in enumerate(context)] + inputs = inputs[-context_depth:] + else: + inputs = [{"text": uttr["text"], "speaker": 1 if uttr["speaker"] == "bot" else 0} for uttr in context] + inputs = inputs[-context_depth:] + + inputs_text = "".join([inputs_by_length(input_, inp_len) for input_, inp_len in zip(inputs, context_lengths)]) + + if encode: + # if encode, return encoded context + inputs_token_ids = tokenizer.encode(inputs_text, return_tensors="pt") + return inputs_token_ids + + return inputs_text + + +def format_dialogue_for_inference(context, context_depth=4, encode=False, tokenizer=None): + """ + THE LAST UTTERANCE IN THE CONTEXT IS TARGET HUMAN'S UTTERANCE + + context: List(dict) + context = [ + {"text": "speaker": "human"}, + {"text": "hi there", "speaker": "bot"}, + {"text": "how are you", "speaker": "human"}, + ] + OR + context = [ + "hi", + "hi there", + "how are you", + ] + """ + if len(context) > 0 and isinstance(context[0], str): + context_len = 
len(context) + # the last uttr is from HUMAN + inputs = [{"text": uttr, "speaker": (context_len - uttr_id - 1) % 2} for uttr_id, uttr in enumerate(context)] + inputs = inputs[-context_depth:] + else: + inputs = [{"text": uttr["text"], "speaker": 1 if uttr["speaker"] == "bot" else 0} for uttr in context] + inputs = inputs[-context_depth:] + + inputs_text = "".join([inputs_by_length(input_) for input_ in inputs]) + length = "2" if random.uniform(0, 1) > SHORT_UTTERANCE_PROBA else "1" + inputs_text += f"|1|{length}|" + + if encode: + # if encode, return encoded context + inputs_token_ids = tokenizer.encode(inputs_text, return_tensors="pt") + return inputs_token_ids + + return inputs_text -try: - model = RussianDialogGPT(PRETRAINED_MODEL_NAME_OR_PATH) - model.model.eval() - if cuda: - model.model.cuda() - - logger.info("dialogpt model is ready") -except Exception as e: - sentry_sdk.capture_exception(e) - logger.exception(e) - raise e app = Flask(__name__) health = HealthCheck(app, "/healthcheck") logging.getLogger("werkzeug").setLevel("WARNING") +def generate(context, num_return_sequences, context_depth): + bot_input_ids = format_dialogue_for_inference( + context, context_depth=context_depth, encode=True, tokenizer=tokenizer + ) + bot_input_ids = bot_input_ids.to(device) + params_default["num_return_sequences"] = num_return_sequences + + chat_history_ids = model.generate(bot_input_ids, pad_token_id=tokenizer.eos_token_id, **params_default) + resp_tokens = chat_history_ids[:, bot_input_ids.shape[-1] :] + outputs = [tokenizer.decode(x, skip_special_tokens=True) for x in resp_tokens] + outputs = [x.split("|")[0] for x in outputs] + + return outputs + + @app.route("/respond", methods=["POST"]) def respond(): st_time = time.time() dialog_contexts = request.json.get("dialog_contexts", []) - num_return_sequences = request.json.get("num_return_sequences", 5) + num_return_sequences = request.json.get("num_return_sequences", 3) try: batch_generated_responses = [] for context in dialog_contexts: # context is a list of dicts, each dict contains text and speaker label # context = [{"text": "utterance text", "speaker": "human"}, ...] - inputs = [{"text": uttr["text"], "speaker": 1 if uttr["speaker"] == "bot" else 0} for uttr in context][-3:] - logger.info(f"dialogpt inputs: {inputs}") - hypotheses = model.get_responses(inputs, params={"num_return_sequences": num_return_sequences}) + logger.info(f"dialogpt inputs: {context[-CONTEXT_DEPTH:]}") + + hypotheses = generate( + context[-CONTEXT_DEPTH:], num_return_sequences=num_return_sequences, context_depth=CONTEXT_DEPTH + ) logger.info(f"dialogpt hypotheses: {hypotheses}") batch_generated_responses.append(hypotheses) diff --git a/services/dialogpt_RU/test.py b/services/dialogpt_RU/test.py index 16963d29c1..405f5bbeb3 100644 --- a/services/dialogpt_RU/test.py +++ b/services/dialogpt_RU/test.py @@ -12,7 +12,7 @@ def test_respond(): ] ] - request_data = {"dialog_contexts": dialog_contexts} + request_data = {"dialog_contexts": dialog_contexts, "num_return_sequences": 5} result = requests.post(url, json=request_data).json()["generated_responses"][0] assert len(result) == 5 and len(result[0]) > 0, f"Got\n{result}" diff --git a/services/dialogrpt_ru/Dockerfile b/services/dialogrpt_ru/Dockerfile index 8bf1368a9e..a84ed56c73 100644 --- a/services/dialogrpt_ru/Dockerfile +++ b/services/dialogrpt_ru/Dockerfile @@ -22,5 +22,8 @@ RUN pip install -r /src/requirements.txt COPY . 
/src +RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${TOKENIZER_NAME_OR_PATH}');" +RUN python -c "from transformers import AutoModelForCausalLM; AutoModelForCausalLM.from_pretrained('${TOKENIZER_NAME_OR_PATH}');" + CMD gunicorn --workers=1 server:app -b 0.0.0.0:${SERVICE_PORT} --timeout=300 diff --git a/services/dialogrpt_ru/feeder.py b/services/dialogrpt_ru/feeder.py index 7558ef24c6..3a68c87708 100644 --- a/services/dialogrpt_ru/feeder.py +++ b/services/dialogrpt_ru/feeder.py @@ -6,7 +6,7 @@ import torch from transformers import AutoTokenizer -TOKENIZER_NAME_OR_PATH = os.getenv("TOKENIZER_NAME_OR_PATH", "Grossmend/rudialogpt3_medium_based_on_gpt2") +TOKENIZER_NAME_OR_PATH = os.getenv("TOKENIZER_NAME_OR_PATH", "DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2") class Feeder: diff --git a/services/dialogrpt_ru/requirements.txt b/services/dialogrpt_ru/requirements.txt index 071abd5ca0..f16bc9958c 100644 --- a/services/dialogrpt_ru/requirements.txt +++ b/services/dialogrpt_ru/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.0.1 +transformers==4.11.0 sentencepiece==0.1.94 flask==1.1.1 gunicorn==19.9.0 diff --git a/services/dialogrpt_ru/utils.py b/services/dialogrpt_ru/utils.py index 84ce6b8178..a49450c31e 100644 --- a/services/dialogrpt_ru/utils.py +++ b/services/dialogrpt_ru/utils.py @@ -6,7 +6,7 @@ EOS_token = "<|endoftext|>" -TOKENIZER_NAME_OR_PATH = os.getenv("TOKENIZER_NAME_OR_PATH", "Grossmend/rudialogpt3_medium_based_on_gpt2") +TOKENIZER_NAME_OR_PATH = os.getenv("TOKENIZER_NAME_OR_PATH", "DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2") class Option: @@ -173,7 +173,7 @@ class Scorer(ScorerBase): def __init__(self, opt): super().__init__(opt) n_embd = 1024 - self.transformer = AutoModelForCausalLM.from_pretrained("Grossmend/rudialogpt3_medium_based_on_gpt2") + self.transformer = AutoModelForCausalLM.from_pretrained(TOKENIZER_NAME_OR_PATH) self.transformer.resize_token_embeddings(len(self.tokenizer)) self.score = torch.nn.Linear(n_embd, 1, bias=False) diff --git a/services/wiki_facts/Dockerfile b/services/wiki_facts/Dockerfile index 915c0c8595..86051a0b89 100644 --- a/services/wiki_facts/Dockerfile +++ b/services/wiki_facts/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.7.6 +FROM deeppavlov/base-gpu:0.17.2 +RUN pip install --upgrade pip && pip install git+https://github.com/deeppavlov/DeepPavlov.git@0.17.2 COPY $SRC_DIR /src WORKDIR /src @@ -19,7 +20,6 @@ RUN pip install -r requirements.txt COPY services/${SERVICE_NAME}/ ./ COPY ./common/ ./common/ -RUN pip install deeppavlov RUN python -m deeppavlov install $CONFIG RUN python -m spacy download en_core_web_sm diff --git a/services/wiki_facts/requirements.txt b/services/wiki_facts/requirements.txt index 26c23ce018..3b916aa9b7 100644 --- a/services/wiki_facts/requirements.txt +++ b/services/wiki_facts/requirements.txt @@ -1,9 +1,11 @@ -sentry-sdk[flask]==0.14.1 flask==1.1.1 itsdangerous==2.0.1 -gunicorn==19.9.0 +gunicorn==20.0.4 +sentry-sdk==0.13.4 requests==2.22.0 +spacy==3.2.0 jinja2<=3.0.3 Werkzeug<=2.0.3 +cryptography==2.8 inflect==5.3.0 -spacy==3.0.6 +blinker==1.5.0 \ No newline at end of file diff --git a/skills/dff_generative_skill/scenario/response.py b/skills/dff_generative_skill/scenario/response.py index ebf5fda1d1..de1f21aef5 100644 --- a/skills/dff_generative_skill/scenario/response.py +++ b/skills/dff_generative_skill/scenario/response.py @@ -52,7 +52,7 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr): request_data = compose_data_for_dialogpt(ctx, actor) if 
len(request_data) > 0: - response = requests.post(DIALOGPT_SERVICE_URL, json={"dialog_contexts": [request_data]}, timeout=1.8) + response = requests.post(DIALOGPT_SERVICE_URL, json={"dialog_contexts": [request_data]}, timeout=3.8) hypotheses = response.json()["generated_responses"][0] else: hypotheses = [] diff --git a/skills/dff_generative_skill/server.py b/skills/dff_generative_skill/server.py index 3d8471361c..58a0220a1e 100644 --- a/skills/dff_generative_skill/server.py +++ b/skills/dff_generative_skill/server.py @@ -37,7 +37,7 @@ def is_container_running(): try: requested_data = [{"speaker": "human", "text": "привет"}] - response = requests.post(DIALOGPT_SERVICE_URL, json={"dialog_contexts": [requested_data]}, timeout=1) + response = requests.post(DIALOGPT_SERVICE_URL, json={"dialog_contexts": [requested_data]}, timeout=4) if response.status_code == 200: return True except Exception as exc: diff --git a/skills/dff_generative_skill/test_server.py b/skills/dff_generative_skill/test_server.py index 5ceb78f9ef..66e7ef711f 100644 --- a/skills/dff_generative_skill/test_server.py +++ b/skills/dff_generative_skill/test_server.py @@ -10,7 +10,7 @@ def handler(requested_data, random_seed): - hypothesis = requests.post(URL, json={**requested_data, "random_seed": random_seed}).json() + hypothesis = requests.post(URL, json={**requested_data, "random_seed": random_seed}, timeout=4).json() return hypothesis @@ -18,9 +18,11 @@ def run_test(handler): in_data, out_data = test_utils.get_dataset() for test_name in in_data: hypothesis = handler(in_data[test_name], RANDOM_SEED) - print(f"test name: {test_name}") - is_equal_flag, msg = test_utils.compare_structs(out_data[test_name], hypothesis, ignored_keys=["id"]) - if msg and len(msg.split("`")) == 5: + # do not compare first elements of the structs - generated texts + is_equal_flag, msg = test_utils.compare_structs( + out_data[test_name][1:], hypothesis[1:], ignored_keys=["id", "responses"] + ) + if msg and len(msg.split("`")) == 3: _, ground_truth_text, _, hypothesis_text, _ = msg.split("`") is_equal_flag, ratio = test_utils.compare_text(ground_truth_text, hypothesis_text, 0.0) if not is_equal_flag: diff --git a/skills/dff_generative_skill/tests/lets_talk_in.json b/skills/dff_generative_skill/tests/lets_talk_in.json index 3c3a4e4860..b6c42136f1 100644 --- a/skills/dff_generative_skill/tests/lets_talk_in.json +++ b/skills/dff_generative_skill/tests/lets_talk_in.json @@ -1,43 +1,417 @@ { "human_utter_index_batch": [ - 0 + 1 ], "dialog_batch": [ { "human_utterances": [ { - "text": "привет! как дела?", + "text": "привет.", + "annotations": { + "spelling_preprocessing": "привет!", + "spacy_annotator": [ + { + "dep_": "ROOT", + "ent_iob_": "O", + "ent_type_": "", + "lemma_": "привет", + "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", + "pos_": "NOUN", + "text": "привет" + }, + { + "dep_": "punct", + "ent_iob_": "O", + "ent_type_": "", + "lemma_": "!", + "morph": "", + "pos_": "PUNCT", + "text": "!" + } + ], + "badlisted_words": { + "bad_words": false + }, + "sentseg": { + "punct_sent": "привет.", + "segments": [ + "привет." 
+ ] + }, + "toxic_classification": { + "toxic": 0.0015796684892848134 + }, + "entity_detection": {}, + "ner": [ + [] + ], + "intent_catcher": { + "choose_topic": { + "confidence": 0.0, + "detected": 0 + }, + "exit": { + "confidence": 0.001, + "detected": 0 + }, + "lets_chat_about": { + "confidence": 0.0, + "detected": 0 + }, + "no": { + "confidence": 0.0, + "detected": 0 + }, + "repeat": { + "confidence": 0.0, + "detected": 0 + }, + "topic_switching": { + "confidence": 0.0, + "detected": 0 + }, + "what_are_you_talking_about": { + "confidence": 0.0, + "detected": 0 + }, + "what_can_you_do": { + "confidence": 0.0, + "detected": 0 + }, + "what_is_your_job": { + "confidence": 0.0, + "detected": 0 + }, + "what_is_your_name": { + "confidence": 0.0, + "detected": 0 + }, + "where_are_you_from": { + "confidence": 0.002, + "detected": 0 + }, + "who_made_you": { + "confidence": 0.0, + "detected": 0 + }, + "yes": { + "confidence": 0.021, + "detected": 0 + } + }, + "entity_linking": [], + "wiki_parser": { + "animals_skill_entities_info": {}, + "entities_info": {}, + "topic_skill_entities_info": {}, + "utt_num": 1, + "wiki_skill_entities_info": {} + } + }, "user": { + "id": "844b675cd88042be942c2c4e789e96d0", + "user_external_id": "dfgkjhdfkgf", + "persona": {}, + "profile": { + "name": null, + "gender": null, + "birthdate": null, + "location": null, + "home_coordinates": null, + "work_coordinates": null, + "occupation": null, + "income_per_year": null + }, + "attributes": {}, "user_type": "human" - }, - "annotations": {} + } }, { - "text": "тоже хорошо. посоветуй мне фильм посмотреть.", + "text": "посоветуй мне фильм.", + "annotations": { + "spacy_annotator": [ + { + "dep_": "ROOT", + "ent_iob_": "O", + "ent_type_": "", + "lemma_": "посоветуй", + "morph": "Degree=Pos|Gender=Masc|Number=Sing|StyleVariant=Short", + "pos_": "ADJ", + "text": "посоветуй" + }, + { + "dep_": "iobj", + "ent_iob_": "O", + "ent_type_": "", + "lemma_": "мне", + "morph": "Case=Dat|Number=Sing|Person=First", + "pos_": "PRON", + "text": "мне" + }, + { + "dep_": "obj", + "ent_iob_": "O", + "ent_type_": "", + "lemma_": "фильм", + "morph": "Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing", + "pos_": "NOUN", + "text": "фильм" + } + ], + "spelling_preprocessing": "посоветуй мне фильм", + "badlisted_words": { + "bad_words": false + }, + "toxic_classification": { + "toxic": 0.008798949420452118 + }, + "sentseg": { + "punct_sent": "посоветуй мне фильм.", + "segments": [ + "посоветуй мне фильм." 
+ ] + }, + "entity_detection": {}, + "ner": [ + [] + ], + "intent_catcher": { + "choose_topic": { + "confidence": 0.0, + "detected": 0 + }, + "exit": { + "confidence": 0.0, + "detected": 0 + }, + "lets_chat_about": { + "confidence": 0.003, + "detected": 0 + }, + "no": { + "confidence": 0.0, + "detected": 0 + }, + "repeat": { + "confidence": 0.0, + "detected": 0 + }, + "topic_switching": { + "confidence": 0.0, + "detected": 0 + }, + "what_are_you_talking_about": { + "confidence": 0.0, + "detected": 0 + }, + "what_can_you_do": { + "confidence": 0.0, + "detected": 0 + }, + "what_is_your_job": { + "confidence": 0.0, + "detected": 0 + }, + "what_is_your_name": { + "confidence": 0.0, + "detected": 0 + }, + "where_are_you_from": { + "confidence": 0.0, + "detected": 0 + }, + "who_made_you": { + "confidence": 0.0, + "detected": 0 + }, + "yes": { + "confidence": 0.0, + "detected": 0 + } + }, + "entity_linking": [], + "wiki_parser": { + "animals_skill_entities_info": {}, + "entities_info": {}, + "topic_skill_entities_info": {}, + "utt_num": 2, + "wiki_skill_entities_info": {} + } + }, "user": { + "id": "636e5e9d7fd1d167c75f8b2e", + "user_external_id": "dfgkjhdfkgf", + "persona": {}, + "profile": { + "name": null, + "gender": null, + "birthdate": null, + "location": null, + "home_coordinates": null, + "work_coordinates": null, + "occupation": null, + "income_per_year": null + }, + "attributes": { + "disliked_skills": [], + "age_group": "", + "dff_generative_skill_state": { + "context": { + "actor_state": {}, + "id": "7c92d5fe-bdb3-45ce-984e-5b00e0dab8bb", + "labels": { + "0": [ + "generation", + "generative_response_node" + ] + }, + "misc": {}, + "requests": { + "0": "привет." + }, + "responses": { + "0": [ + [ + "А то я уже отчаялся найти вас в своей группе и даже на этом подсайте.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "А где у меня написано, что я на полном серьезе пишу? Я не говорю, что надо всем, но надо.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "И, что характерно, \" не так всё плохо \" — это, оказывается, \" плохо \".", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ] + }, + "validation": false + }, + "current_turn_dff_suspended": false, + "history": { + "0": [ + "generation", + "generative_response_node" + ] + }, + "previous_human_utter_index": 0, + "shared_memory": {} + }, + "dff_shared_state": { + "cross_links": {}, + "cross_states": {} + }, + "used_links": {} + }, "user_type": "human" - }, - "annotations": {} + } } ], "bot_utterances": [ { - "text": "отлично. а твои как?", + "text": "Привет, это чат-бот Dream! А где у меня написано, что я на полном серьезе пишу? Я не говорю, что надо всем, но надо.", + "annotations": { + "sentseg": { + "punct_sent": "Привет. это чат - бот Dream А где у меня написано что я на полном серьезе пишу Я не говорю что надо всем но надо?", + "segments": [ + "Привет.", + "это чат - бот Dream А где у меня написано что я на полном серьезе пишу Я не говорю что надо всем но надо?" + ] + }, + "ner": [ + [], + [] + ] + }, + "active_skill": "dff_generative_skill", "user": { + "id": "c0c3987ecba44b2bb389b46518d67949", + "persona": {}, + "attributes": {}, "user_type": "bot" - }, - "annotations": {} + } } ] } ], "dff_generative_skill_state_batch": [ - {} + { + "context": { + "actor_state": {}, + "id": "7c92d5fe-bdb3-45ce-984e-5b00e0dab8bb", + "labels": { + "0": [ + "generation", + "generative_response_node" + ] + }, + "misc": {}, + "requests": { + "0": "привет." 
+ }, + "responses": { + "0": [ + [ + "А то я уже отчаялся найти вас в своей группе и даже на этом подсайте.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "А где у меня написано, что я на полном серьезе пишу? Я не говорю, что надо всем, но надо.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "И, что характерно, \" не так всё плохо \" — это, оказывается, \" плохо \".", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ] + }, + "validation": false + }, + "current_turn_dff_suspended": false, + "history": { + "0": [ + "generation", + "generative_response_node" + ] + }, + "previous_human_utter_index": 0, + "shared_memory": {} + } ], "dff_shared_state_batch": [ { - "cross_states": {}, - "cross_links": {} + "cross_links": {}, + "cross_states": {} } ], "entities_batch": [ @@ -47,13 +421,12 @@ {} ], "age_group_batch": [ - "unknown" + "" ], "disliked_skills_batch": [ [] ], "clarification_request_flag_batch": [ false - ], - "random_seed": 2718 + ] } \ No newline at end of file diff --git a/skills/dff_generative_skill/tests/lets_talk_out.json b/skills/dff_generative_skill/tests/lets_talk_out.json index 04fcd615f2..0f1042666e 100644 --- a/skills/dff_generative_skill/tests/lets_talk_out.json +++ b/skills/dff_generative_skill/tests/lets_talk_out.json @@ -1,48 +1,343 @@ [ [ [ - "А я тебе советую книгу почитать.", - "\"Скотт Пилигрим против всех\"", - "не смотри.", - "\"Скотт Пилигрим против всех\"", - "не смотри." + "про любовь.", + "какой?", + "Причем желательно с субтитрами." ], [ - 0.99, - 0.99, 0.99, 0.99, 0.99 ], [ - {}, - {}, - {}, - {}, - {} + { + "dff_generative_skill_state": { + "shared_memory": {}, + "previous_human_utter_index": 1, + "history": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "current_turn_dff_suspended": false, + "context": { + "id": "7c92d5fe-bdb3-45ce-984e-5b00e0dab8bb", + "labels": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "requests": { + "0": "привет.", + "1": "посоветуй мне фильм." + }, + "responses": { + "0": [ + [ + "А то я уже отчаялся найти вас в своей группе и даже на этом подсайте.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "А где у меня написано, что я на полном серьезе пишу? 
Я не говорю, что надо всем, но надо.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "И, что характерно, \" не так всё плохо \" — это, оказывается, \" плохо \".", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ], + "1": [ + [ + "про любовь.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "какой?", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "Причем желательно с субтитрами.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ] + }, + "misc": {}, + "validation": false, + "actor_state": {} + } + }, + "dff_shared_state": { + "cross_links": {}, + "cross_states": {} + }, + "used_links": {}, + "age_group": "", + "disliked_skills": [] + }, + { + "dff_generative_skill_state": { + "shared_memory": {}, + "previous_human_utter_index": 1, + "history": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "current_turn_dff_suspended": false, + "context": { + "id": "7c92d5fe-bdb3-45ce-984e-5b00e0dab8bb", + "labels": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "requests": { + "0": "привет.", + "1": "посоветуй мне фильм." + }, + "responses": { + "0": [ + [ + "А то я уже отчаялся найти вас в своей группе и даже на этом подсайте.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "А где у меня написано, что я на полном серьезе пишу? Я не говорю, что надо всем, но надо.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "И, что характерно, \" не так всё плохо \" — это, оказывается, \" плохо \".", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ], + "1": [ + [ + "про любовь.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "какой?", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "Причем желательно с субтитрами.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ] + }, + "misc": {}, + "validation": false, + "actor_state": {} + } + }, + "dff_shared_state": { + "cross_links": {}, + "cross_states": {} + }, + "used_links": {}, + "age_group": "", + "disliked_skills": [] + }, + { + "dff_generative_skill_state": { + "shared_memory": {}, + "previous_human_utter_index": 1, + "history": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "current_turn_dff_suspended": false, + "context": { + "id": "7c92d5fe-bdb3-45ce-984e-5b00e0dab8bb", + "labels": { + "0": [ + "generation", + "generative_response_node" + ], + "1": [ + "generation", + "generative_response_node" + ] + }, + "requests": { + "0": "привет.", + "1": "посоветуй мне фильм." + }, + "responses": { + "0": [ + [ + "А то я уже отчаялся найти вас в своей группе и даже на этом подсайте.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "А где у меня написано, что я на полном серьезе пишу? 
Я не говорю, что надо всем, но надо.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "И, что характерно, \" не так всё плохо \" — это, оказывается, \" плохо \".", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ], + "1": [ + [ + "про любовь.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "какой?", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ], + [ + "Причем желательно с субтитрами.", + 0.99, + {}, + {}, + { + "can_continue": "can" + } + ] + ] + }, + "misc": {}, + "validation": false, + "actor_state": {} + } + }, + "dff_shared_state": { + "cross_links": {}, + "cross_states": {} + }, + "used_links": {}, + "age_group": "", + "disliked_skills": [] + } ], [ - {}, - {}, {}, {}, {} ], [ { - "can_continue": "no" - }, - { - "can_continue": "no" - }, - { - "can_continue": "no" + "can_continue": "can" }, { - "can_continue": "no" + "can_continue": "can" }, { - "can_continue": "no" + "can_continue": "can" } ] ] From c9da4603e9cd300d3a35678fc06ab65a23639ee4 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Mon, 14 Nov 2022 15:30:26 +0300 Subject: [PATCH 2/5] Expand random skills list by one (#210) --- tests/dream/assert_test_dialogs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/dream/assert_test_dialogs.py b/tests/dream/assert_test_dialogs.py index 33b307b283..4e42535144 100644 --- a/tests/dream/assert_test_dialogs.py +++ b/tests/dream/assert_test_dialogs.py @@ -44,6 +44,7 @@ "dff_wiki_skill", "game_cooperative_skill", "dialogpt", + "dialogpt_persona_based", ], } From 618b70890005928e5d22361d9c3f6a310c15599d Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Tue, 15 Nov 2022 10:55:45 +0300 Subject: [PATCH 3/5] fix: use language variable in agent container (#209) * fix: use language variable in agent container * fix: codestyle --- .../dream_russian/docker-compose.override.yml | 1 + skills/dummy_skill/connector.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml index db893626d2..37b9de2d4c 100644 --- a/assistant_dists/dream_russian/docker-compose.override.yml +++ b/assistant_dists/dream_russian/docker-compose.override.yml @@ -9,6 +9,7 @@ services: dff-friendship-skill:8086, entity-detection:8103, dialogpt:8091, dff-template-skill:8120, spacy-annotator:8125, dialogrpt:8122, toxic-classification:8126" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} + LANGUAGE: RU dff-program-y-skill: env_file: [.env] diff --git a/skills/dummy_skill/connector.py b/skills/dummy_skill/connector.py index 1baac00f25..f6f9525cdb 100644 --- a/skills/dummy_skill/connector.py +++ b/skills/dummy_skill/connector.py @@ -42,6 +42,7 @@ LINK_TO_PROB = 0.5 LINK_TO_PHRASES = sum([list(list_el) for list_el in skills_phrases_map.values()], []) +LANGUAGE = getenv("LANGUAGE", "EN") with open("skills/dummy_skill/google-english-no-swears.txt", "r") as f: TOP_FREQUENT_UNIGRAMS = f.read().splitlines()[:1000] @@ -199,11 +200,8 @@ async def send(self, payload: Dict, callback: Callable): human_attrs = [] bot_attrs = [] attrs = [] - prev_human_uttr_text = dialog["human_utterances"][-2]["text"] if len(dialog["human_utterances"]) > 1 else "" - is_russian = re.search(r"[а-яА-Я]+", dialog["human_utterances"][-1]["text"]) or re.search( - r"[а-яА-Я]+", prev_human_uttr_text - ) - if is_russian: + + if LANGUAGE == "RU": cands += [choice(DUMMY_DONTKNOW_RESPONSES["RU"])] else: cands += [choice(DUMMY_DONTKNOW_RESPONSES["EN"])] @@ -212,7 +210,7 
@@ async def send(self, payload: Dict, callback: Callable): human_attrs += [{}] bot_attrs += [{}] - if len(dialog["utterances"]) > 14 and not is_sensitive_case and not is_russian: + if len(dialog["utterances"]) > 14 and not is_sensitive_case and LANGUAGE == "EN": questions_same_nps = [] for i, nphrase in enumerate(curr_nounphrases): for q_id in NP_QUESTIONS.get(nphrase, []): @@ -227,7 +225,7 @@ async def send(self, payload: Dict, callback: Callable): bot_attrs += [{}] link_to_question, human_attr = get_link_to_question(dialog, all_prev_active_skills) - if link_to_question and not is_russian: + if link_to_question and LANGUAGE == "EN": _prev_bot_uttr = dialog["bot_utterances"][-2]["text"] if len(dialog["bot_utterances"]) > 1 else "" _bot_uttr = dialog["bot_utterances"][-1]["text"] if len(dialog["bot_utterances"]) > 0 else "" _prev_active_skill = ( @@ -270,14 +268,14 @@ async def send(self, payload: Dict, callback: Callable): attrs += [{"type": "link_to_for_response_selector", "response_parts": ["prompt"]}] human_attrs += [human_attr] bot_attrs += [{}] - elif is_russian: + elif LANGUAGE == "RU": cands += [random.choice(RUSSIAN_RANDOM_QUESTIONS)] confs += [0.8] attrs += [{"type": "link_to_for_response_selector", "response_parts": ["prompt"]}] human_attrs += [{}] bot_attrs += [{}] - if not is_russian: + if LANGUAGE == "EN": facts_same_nps = [] for i, nphrase in enumerate(curr_nounphrases): for fact_id in NP_FACTS.get(nphrase, []): @@ -289,7 +287,7 @@ async def send(self, payload: Dict, callback: Callable): else: facts_same_nps = [] - if len(facts_same_nps) > 0 and not is_sensitive_case and not is_russian: + if len(facts_same_nps) > 0 and not is_sensitive_case and LANGUAGE == "EN": logger.info("Found special nounphrases for facts. Return fact with the same nounphrase.") cands += [choice(facts_same_nps)] confs += [0.5] From 896a163db5b0c3035fa34792e3c42d2dcd32af33 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Tue, 15 Nov 2022 12:56:02 +0300 Subject: [PATCH 4/5] fix: remove extra logs (#208) --- common/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/common/utils.py b/common/utils.py index 5b923b4f48..019dcc2875 100644 --- a/common/utils.py +++ b/common/utils.py @@ -764,10 +764,8 @@ def get_topics(annotated_utterance, probs=False, default_probs=None, default_lab answer_probs, answer_labels = default_probs, default_labels if probs: - logger.info(f"Result in get_topics: {answer_probs}") return answer_probs else: - logger.info(f"Result in get_topics: {answer_labels}") return answer_labels @@ -862,10 +860,8 @@ def get_intents(annotated_utterance, probs=False, default_probs=None, default_la answer_probs, answer_labels = default_probs, default_labels if probs: - logger.info(f"Result in get_intents: {answer_probs}") return answer_probs else: - logger.info(f"Result in get_intents: {answer_labels}") return answer_labels From c0588b99ae1f411e10c91be57511675a808157a0 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Tue, 15 Nov 2022 15:02:18 +0300 Subject: [PATCH 5/5] fix: not add greeting (#211) --- response_selectors/convers_evaluation_based_selector/server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/response_selectors/convers_evaluation_based_selector/server.py b/response_selectors/convers_evaluation_based_selector/server.py index dfcaa22d7f..e21fa3cc3d 100644 --- a/response_selectors/convers_evaluation_based_selector/server.py +++ b/response_selectors/convers_evaluation_based_selector/server.py @@ -50,6 +50,7 @@ "I didn't get it. 
Sorry", ] LANGUAGE = getenv("LANGUAGE", "EN") +GREETING_FIRST = int(getenv("GREETING_FIRST", 1)) @app.route("/respond", methods=["POST"]) @@ -366,7 +367,7 @@ def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev best_human_attributes = best_candidate.get("human_attributes", {}) best_bot_attributes = best_candidate.get("bot_attributes", {}) - if len(dialog["bot_utterances"]) == 0 and greeting_spec[LANGUAGE] not in best_text: + if len(dialog["bot_utterances"]) == 0 and greeting_spec[LANGUAGE] not in best_text and GREETING_FIRST: # add greeting to the first bot uttr, if it's not already included best_text = f"{HI_THIS_IS_DREAM[LANGUAGE]} {best_text}"
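
For reference, the `|speaker|length|text|` prompt layout built by the new `inputs_by_length()` and `format_dialogue_for_inference()` helpers in `services/dialogpt_RU/server.py` (PATCH 1/5) can be reproduced standalone. The following is a minimal sketch rather than the service code: the example utterances are invented, while the length buckets and the sampled short-reply code mirror the logic in the patch.

import random

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2")


def length_bucket(text):
    # Length-control codes used by inputs_by_length(): "1" for <= 15 tokens,
    # "2" for <= 50, "3" for <= 256, "-" otherwise.
    n_tokens = len(tokenizer.encode(text))
    if n_tokens <= 15:
        return "1"
    elif n_tokens <= 50:
        return "2"
    elif n_tokens <= 256:
        return "3"
    return "-"


# Speakers alternate and are counted back from the last (human) utterance:
# 0 = human, 1 = bot, as in format_dialogue_for_inference().
context = ["привет", "привет, как дела?", "посоветуй мне фильм"]  # invented example
speakers = [(len(context) - i - 1) % 2 for i in range(len(context))]

prompt = "".join(f"|{spk}|{length_bucket(text)}|{text}" for spk, text in zip(speakers, context))
# The server then opens a bot turn and samples its target length:
# "1" (short) with probability SHORT_UTTERANCE_PROBA = 0.7, else "2".
length = "1" if random.uniform(0, 1) <= 0.7 else "2"
prompt += f"|1|{length}|"
print(prompt)  # e.g. |0|1|привет|1|1|привет, как дела?|0|1|посоветуй мне фильм|1|1| (length codes depend on the tokenizer)

The trailing `|1|{length}|` cell is what steers the model toward a reply in the chosen length bucket at inference time, and it is also why `generate()` keeps only the text before the first `|` in each decoded continuation.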
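
A matching client-side sketch for the updated `/respond` endpoint, following `services/dialogpt_RU/test.py` and the `dff_generative_skill` calls above. The dialogue is invented, and the full URL is an assumption based on `SERVICE_PORT: 8091` in the docker-compose override.

import requests

DIALOGPT_SERVICE_URL = "http://0.0.0.0:8091/respond"  # assumed host/port mapping

# One list per dialogue; each turn carries "text" and a "speaker" label ("human" or "bot").
dialog_contexts = [
    [
        {"speaker": "human", "text": "привет!"},
        {"speaker": "bot", "text": "привет, как дела?"},
        {"speaker": "human", "text": "посоветуй мне фильм"},
    ]
]

response = requests.post(
    DIALOGPT_SERVICE_URL,
    json={"dialog_contexts": dialog_contexts, "num_return_sequences": 5},
    timeout=4,  # in line with the timeouts raised to ~4 s in this patch set
)
# The service returns one list of num_return_sequences hypotheses per dialogue.
hypotheses = response.json()["generated_responses"][0]
assert len(hypotheses) == 5 and len(hypotheses[0]) > 0, hypotheses
print(hypotheses)

Since `dialog_contexts` is batched, several dialogues can be scored in one request; server-side, only the last CONTEXT_DEPTH = 3 turns of each dialogue are fed to the model.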