Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates to grounded flow #53

Merged
merged 2 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/test_freeform_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@

ds = Dataset.from_list(samples)

skills_flow = SynthSkillsFlow(client, teacher_model).get_flow()
skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
skills_pipe = Pipeline(skills_flow)

sdg = SDG([skills_pipe])
Expand Down
107 changes: 107 additions & 0 deletions scripts/test_grounded_skills.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Third Party
from datasets import Dataset
from openai import OpenAI

# First Party
from src.instructlab.sdg import SDG
from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
from src.instructlab.sdg.pipeline import Pipeline

# Manual smoke test for the grounded-skills synthetic data flow.
#
# The script connects to an OpenAI-compatible endpoint, uses the first model
# the endpoint advertises as the teacher, pushes a single seed example
# (meeting transcript -> company-wide email) through SynthGroundedSkillsFlow,
# and prints the generated dataset.

# for vLLM endpoints, the api_key remains "EMPTY"
openai_api_key = "EMPTY"
# Placeholder: replace with the base URL of the serving endpoint before running.
openai_api_base = "Add model endpoint here"


client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Use whichever model the endpoint lists first as the teacher model.
models = client.models.list()
teacher_model = models.data[0].id

# NOTE: the seed text below was originally pasted from YAML.  YAML-only
# escapes (doubled apostrophes inside single-quoted scalars, and "\ " escaped
# spaces inside double-quoted scalars) are not understood by Python and would
# end up verbatim in the prompt, so they are written here as plain Python.
samples = [
    {
        "seed_context": """*Ms. Thompson:* Good morning, everyone. Today, we're here to discuss
our customer journey mapping and analysis. I believe this is crucial to understanding
our customers' experiences and improving our services.


*Mr. Patel:* I agree, Lisa. We should start by identifying all touchpoints in
our customer journey, from initial contact to post-sale support.


*Ms. Rodriguez:* Yes, and let's not forget about the emotional aspect of the
journey. How do our customers feel at each stage? What are their pain points?


*Mr. Kim:* We can use data from our CRM system to track the customer journey and
gather insights. This will help us create a detailed, data-driven map.


*Ms. Johnson:* Once we have the map, we can analyze it to identify areas for improvement.
Perhaps there are steps where customers drop off or express dissatisfaction.


*Mr. Davis:* We should also consider the customer's perspective. Conducting interviews
or surveys can provide valuable insights into their thoughts and feelings.


*Ms. Brown:* Absolutely. And once we've identified areas for improvement, we
can develop strategies to address them. This might involve redesigning certain
touchpoints, enhancing our communication, or streamlining processes.


*Mr. Smith:* And we must remember to measure the impact of any changes we make.
By tracking key performance indicators, we can determine whether our efforts are
successful.


*Ms. Thompson:* Great suggestions, everyone. Let's divide into teams to tackle
different aspects of the customer journey. Team 1 will focus on pre-sale touchpoints,
Team 2 on sales, and Team 3 on post-sale support.


*Mr. Patel:* Sounds good. We'll reconvene in a week to share our findings and
discuss next steps.""",
        "seed_question": """Generate a company wide email based on the given meeting transcript""",
        "task_description": "Writing emails based on minutes of meeting",
        # Built from adjacent string literals (implicit concatenation) so the
        # text does not depend on backslash line continuations or on the
        # indentation of continuation lines.
        "seed_response": (
            "Subject: Recap and Next Steps - Customer Journey Mapping and Analysis\n"
            "\nDear [Company Name] Team,\n\nI hope this email finds you well. Yesterday, we"
            " gathered to discuss our customer journey mapping and analysis, aiming to understand"
            " our customers' experiences and identify opportunities for improvement. The discussions"
            " were fruitful, and I want to share a brief overview of the key points and outcomes.\n"
            "\n1. **Identifying Touchpoints:**\n Mr. Patel suggested mapping all touchpoints"
            " in our customer journey, from initial contact to post-sale support.\n\n2. **Emotional"
            " Aspect and Pain Points:**\n Ms. Rodriguez emphasized the importance of considering"
            " the emotional aspect of the journey and identifying customers' pain points at"
            " each stage.\n\n3. **Data-Driven Mapping:**\n Mr. Kim proposed using data from"
            " our CRM system to create a detailed, data-driven customer journey map.\n\n4."
            " **Customer Perspective:**\n Ms. Johnson recommended gathering insights from"
            " the customer's perspective through interviews or surveys.\n\n5. **Analysis and"
            " Improvement:**\n Ms. Brown suggested analyzing the customer journey map to"
            " identify areas for improvement and developing strategies to address them.\n\n"
            "6. **Measuring Impact:**\n Mr. Smith stressed the need to measure the impact"
            " of any changes made by tracking key performance indicators.\n\nTo facilitate"
            " a comprehensive analysis, we have divided into teams to tackle different aspects"
            " of the customer journey:\n\n* Team 1: Pre-sale touchpoints\n* Team 2: Sales\n"
            "* Team 3: Post-sale support\n\nEach team will share their findings and discuss"
            " next steps in a week.\n\nYour engagement and insights have been invaluable in"
            " understanding our customers' experiences and identifying opportunities for improvement."
            " I look forward to our continued collaboration as we work towards enhancing our"
            " services and delivering exceptional customer experiences.\n\nBest regards,\n"
            "\n[Your Full Name]\n[Your Position]\n[Company Name]"
        ),
    }
]


ds = Dataset.from_list(samples)

# "mixtral" selects the model family; the trailing 10 is passed positionally.
# NOTE(review): presumably the number of generation iterations/samples --
# confirm against the SynthGroundedSkillsFlow constructor.
skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
skills_pipe = Pipeline(skills_flow)

sdg = SDG([skills_pipe])
gen_data = sdg.generate(ds)

print(gen_data)
# NOTE(review): indexing assumes at least one generated row survives the
# flow's filter blocks -- confirm, or guard, if empty results are possible.
print(gen_data[0])
4 changes: 2 additions & 2 deletions scripts/test_knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@

ds = Dataset.from_list(samples)

mmlu_flow = MMLUBenchFlow(client, teacher_model).get_flow()
knowledge_flow = SynthKnowledgeFlow(client, teacher_model).get_flow()
mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
knowledge_pipe = Pipeline(knowledge_flow)
mmlu_pipe = Pipeline(mmlu_flow)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ examples: |
[End of Score]

generation: |
Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above.
Here's the context, question and the answer you need to evaluate:

[Start of Context]
Expand All @@ -45,7 +46,6 @@ generation: |
{answer}
[End of Answer]

Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above.
* Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.
* Return the score between [Start of Score] and [End of Score] tags.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ examples: |
[End of Score]

generation: |
Here's the context and question you need to evaluate:
Here's the context and question you need to evaluate. Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.

[Start of Context]
{context}
Expand Down
4 changes: 2 additions & 2 deletions src/instructlab/sdg/configs/skills/freeform_responses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ examples: |
[End of Response]

generation: |
Now generate a response to the following prompt.
Now generate a response to the following prompt. Remember to use the same style and format as the example above.

[Start of Question]
{question}
[End of Question]

Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags.
Return the response between [Start of Response] and [End of Response] tags.

start_tags: ["[Start of Response]"]
end_tags: ["[End of Response]"]
5 changes: 4 additions & 1 deletion src/instructlab/sdg/configs/skills/grounded_responses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ examples: |
[End of Response]

generation: |
Now generate a response to the following prompt. Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags.
Now generate a response to the following prompt. Remember to use the same style and format as the example above.
Return the response between [Start of Response] and [End of Response] tags.

[Start of Context]
{context}
Expand All @@ -35,6 +36,8 @@ generation: |
{question}
[End of Question]

Return the response between [Start of Response] and [End of Response] tags.


start_tags: ["[Start of Response]"]
end_tags: ["[End of Response]"]
36 changes: 26 additions & 10 deletions src/instructlab/sdg/default_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .filterblock import FilterByValueBlock
from .iterblock import IterBlock
from .llmblock import LLMBlock
from .utilblocks import CombineColumnsBlock

MODEL_FAMILY_MIXTRAL = "mixtral"
MODEL_FAMILY_MERLINITE = "merlinite"
Expand Down Expand Up @@ -225,8 +226,9 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_relevancy",
"filter_column": "score",
"filter_value": "2",
"filter_value": 2.0,
"operation": operator.eq,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
Expand Down Expand Up @@ -258,8 +260,9 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_verify_question",
"filter_column": "rating",
"filter_value": "1",
"filter_value": 1.0,
"operation": operator.eq,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
Expand Down Expand Up @@ -309,9 +312,9 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_questions",
"filter_column": "score",
"filter_value": 1,
"filter_value": 1.0,
"operation": operator.eq,
"convert_dtype": int,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
Expand Down Expand Up @@ -353,9 +356,9 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_qa_pair",
"filter_column": "score",
"filter_value": 2,
"filter_value": 2.0,
"operation": operator.ge,
"convert_dtype": int,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
Expand Down Expand Up @@ -420,6 +423,7 @@ def get_flow(self) -> list:
"batch_kwargs": {
"num_procs": 8,
"batched": self.batched,
"num_samples": 10,
},
},
},
Expand All @@ -428,9 +432,9 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_grounded_questions",
"filter_column": "score",
"filter_value": 1,
"filter_value": 1.0,
"operation": operator.eq,
"convert_dtype": int,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
Expand Down Expand Up @@ -472,12 +476,24 @@ def get_flow(self) -> list:
"block_config": {
"block_name": "filter_grounded_qa_pair",
"filter_column": "score",
"filter_value": 2,
"filter_value": 2.0,
"operation": operator.ge,
"convert_dtype": int,
"convert_dtype": float,
"batch_kwargs": {
"num_procs": 8,
},
},
},
{
"block_type": CombineColumnsBlock,
"block_config": {
"block_name": "combine_question_and_context",
"columns": ["context", "question"],
"output_col": "question",
"batch_kwargs": {
"num_procs": 8,
"batched": True,
},
},
},
]