diff --git a/script/app-question-classification/_cm.json b/script/app-question-classification/_cm.json index 3b481b8..546c064 100644 --- a/script/app-question-classification/_cm.json +++ b/script/app-question-classification/_cm.json @@ -2,9 +2,19 @@ "alias": "app-question-classification", "automation_alias": "script", "automation_uid": "5b4e0237da074764", - "cache": false, + "cache": true, "default_variation": "rh", "deps": [ + { + "tags": "set,echo-off,win" + }, + { + "names": [ + "python", + "python3" + ], + "tags": "get,python3" + }, { "tags": "get,dataset,original,question-topic,go" }, @@ -20,9 +30,6 @@ { "tags": "get,generic-python-lib,_numpy" }, - { - "tags": "get,generic-python-lib,_torch" - }, { "tags": "get,generic-python-lib,_tqdm" }, @@ -43,7 +50,8 @@ "CM_DATASET": "QUESTION-TOPIC" }, "new_env_keys": [ - "CM_DATASET_OUTPUT_MODEL*" + "CM_DATASET_OUTPUT_MODEL*", + "CM_ML_MODEL_ANSWER" ], "tags": [ "app", @@ -61,6 +69,9 @@ "deps":[ { "tags": "get, ml-model, question-classification, go, qa, question-topic, _rh" + }, + { + "tags": "get,generic-python-lib,_torch" } ] }, @@ -71,9 +82,48 @@ "deps":[ { "tags": "get, ml-model, question-classification, go, qa, question-topic, _rt" + }, + { + "tags": "get,generic-python-lib,_torch" + } + ] + }, + "LLMGPT3.5":{ + "env":{ + "CM_ML_MODEL_NAME": "GPT3.5" + }, + "deps":[ + { + "tags": "get,tags,preprocessed,dataset" + }, + { + "tags": "get,generic-python-lib,_openai" } ] }, + "LLMCLAUDE_SONNET":{ + "env":{ + "CM_ML_MODEL_NAME": "CLAUDE_SONNET" + }, + "deps":[ + { + "tags": "get,tags,preprocessed,dataset" + }, + { + "tags": "get,generic-python-lib,_anthropic" + } + ] + }, + "OPENAI_API_KEY.#":{ + "env":{ + "OPENAI_API_KEY" : "#" + } + }, + "ANTHROPIC_API_KEY.#":{ + "env":{ + "ANTHROPIC_API_KEY" : "#" + } + }, "path.#":{ "env": { "CM_DATASET_PATH": "#" diff --git a/script/app-question-classification/customize.py b/script/app-question-classification/customize.py index d756a12..40a5d1e 100644 --- a/script/app-question-classification/customize.py +++ b/script/app-question-classification/customize.py @@ -1,29 +1,5 @@ from cmind import utils import os -import pickle -import pandas as pd -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.svm import LinearSVC -import csv -import numpy as np -from setfit import SetFitModel -from datasets import load_dataset -import torch - - -#get the input data to train the model -def get_data_file(filename): - dtrain=pd.read_csv(filename,header=0) - return dtrain - -def get_trainfile_solnval(filename): - testfile = get_data_file(filename) - l = [] - data = testfile['Tag'] - for value in data: - if value not in l: - l.append(value) - return l def preprocess(i): @@ -31,67 +7,10 @@ def preprocess(i): env = i['env'] - if(env['CM_ML_MODEL_NAME'] == "go_2"): - dataset = load_dataset("ANANDHU-SCT/TOPIC_CLASSIFICATION") - model = SetFitModel.from_pretrained(env['CM_ML_MODEL']) - probs = model.predict_proba(dataset['test']['Question']) - final_result = [] - resultfile = pd.DataFrame() - resultfile["Question"] = dataset["test"]["Question"] - resultfile["Tag"] = dataset["test"]["Tag"] - resultfile["Actual soln"] = dataset["test"]["label"] - for prob in probs: - print(type(prob)) - try: - topk_values, topk_indices = torch.topk(torch.from_numpy(prob), k=5) - except: - topk_values, topk_indices = torch.topk(prob, k=5) - # print(torch.argmax(prob, dim=0) - final_result.append(topk_indices.tolist()) - resultfile["PredictedLabels"] = final_result - resultfile.to_csv('Predicted_answers.csv') - - return {'return':0} - # print(probs) - - else: - testfile = get_data_file(env['CM_PREPROCESSED_DATASET_TEST_PATH']) - ans_list = get_trainfile_solnval(env['CM_PREPROCESSED_DATASET_TRAIN_PATH']) - soln_file = get_data_file(env['CM_DATASET_SOLUTION_PATH'])["Tag"] - - loaded_model = pickle.load(open(env['CM_ML_MODEL'], 'rb')) - tfidfvect = pickle.load(open(env['CM_DATASET_TRAINED_MODEL_TFIDQ'], 'rb')) - - p=loaded_model.predict(tfidfvect.transform(testfile['Question'])) - prob=loaded_model.predict_proba(tfidfvect.transform(testfile['Question'])) - - main_list=[] - sub_list=[] - solutions = [] - sub_solutions = [] - - for ques_prob in prob: - for probs in ques_prob: - if probs>0.02: - sub_list.append(probs) - index = np.where(ques_prob==probs)[0].tolist()[0] - sub_solutions.append(ans_list[index]) - main_list.append(sub_list) - solutions.append(sub_solutions) - sub_list=[] - sub_solutions=[] - - testfile['Tag'] = p - testfile['Actual soln'] = soln_file - testfile['PredictedLabels'] = solutions - testfile['Probabilities'] = main_list - - testfile.to_csv('Predicted_answers.csv') - - return {'return':0} + return {'return':0} def postprocess(i): env = i['env'] - env['CM_ML_MODEL_ANSWER'] = os.path.join(os.getcwd(),"Predicted_answers.csv") + # env['CM_ML_MODEL_ANSWER'] = os.path.join(os.getcwd(),"Predicted_answers.csv") return {'return':0} diff --git a/script/app-question-classification/process.py b/script/app-question-classification/process.py new file mode 100644 index 0000000..61ff222 --- /dev/null +++ b/script/app-question-classification/process.py @@ -0,0 +1,201 @@ +import os +import pickle +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.svm import LinearSVC +import csv +import numpy as np + +from datasets import load_dataset + +import json +from tqdm import tqdm + +#get the input data to train the model +def get_data_file(filename): + dtrain=pd.read_csv(filename,header=0) + return dtrain + +def get_trainfile_solnval(filename): + testfile = get_data_file(filename) + l = [] + data = testfile['Tag'] + for value in data: + if value not in l: + l.append(value) + return l + +#get the openAI client object +def getOpenAIClient(APIKEY): + from openai import OpenAI + client = OpenAI(api_key=APIKEY) + return client + +#get the anthropic client object +def getAnthropicClient(APIKEY): + import anthropic + client = anthropic.Anthropic( + api_key=APIKEY, + ) + return client + +#get the response form OpenAI +def getOpenAIresponse(openAIClient, content): + response = openAIClient.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": content} + ], + temperature=0.2, + top_p=0.1 + ) + formattedResponse = response.choices[0].message.content + return formattedResponse + +def getAnthropicresponse(anthropicClient, content): + message = anthropicClient.messages.create( + model="claude-3-sonnet-20240229", + max_tokens=204, + messages=[ + {"role": "user", "content": content} + ] + ) + formattedResponse = message.content[0].text + return formattedResponse + + +if(os.environ['CM_ML_MODEL_NAME'] == "go_2"): + from setfit import SetFitModel + import torch + dataset = load_dataset("ANANDHU-SCT/TOPIC_CLASSIFICATION") + model = SetFitModel.from_pretrained(os.environ['CM_ML_MODEL']) + probs = model.predict_proba(dataset['test']['Question']) + final_result = [] + resultfile = pd.DataFrame() + resultfile["Question"] = dataset["test"]["Question"] + resultfile["Tag"] = dataset["test"]["Tag"] + resultfile["Actual soln"] = dataset["test"]["label"] + for prob in probs: + print(type(prob)) + try: + topk_values, topk_indices = torch.topk(torch.from_numpy(prob), k=5) + except: + topk_values, topk_indices = torch.topk(prob, k=5) + # print(torch.argmax(prob, dim=0) + final_result.append(topk_indices.tolist()) + resultfile["PredictedLabels"] = final_result + resultfile.to_csv('Predicted_answers.csv') + + # print(probs) +elif(os.environ['CM_ML_MODEL_NAME'] == "CLAUDE_SONNET"): + testfile = get_data_file(os.environ['CM_DATASET_SOLUTION_PATH']) + tagListPath = os.environ["CM_DATASET_TAGS"] + tagListPath=r"{}".format(tagListPath) + anthropicClient = getAnthropicClient(os.environ["ANTHROPIC_API_KEY"]) + with open(tagListPath, 'r') as file: + # Load the JSON data + data = json.load(file) + # Obtain the value of the key + tagList = None + for key in data: + tagList = data[key] + predictedTagList = [] + for question in tqdm(testfile['Question']): + fewShotPrompt = f""" + You are an expert topic classifier. Your role is to analyse a question and classify the question into any of the following question Tags: + {tagList} + The steps to infer the answer would be: + 1. Analyse the question + 2. Infer the topic to which the question belongs. + 3. Check if the question belongs to any of the specific area within the infered topic. eg; if the topic to which the question belongs is arrays and on further analysis, if the question can be specifically associated with subtopic such as array multipliers (which comes within the topic array), then the topic to be asssigned to the question is array multiplier. + 4. If any specific subtopic is found , return that as answer else return the topic as the answer. + 5. return only the answer as string , dont explain. + + Here are some examples: + + input: The expression large frac x y x y 2 is equal to The maximum of x and y The minimum of x and y 1 None of the above + output: Absolute Value + + input: The number of full and half adders required to add 16 bit numbers is 8 half adders 8 full adders 1 half adder 15 full adders 16 half adders 0 full adders 4 half adders 12 full adders + output: Adder + + input: Suppose a fair six sided die is rolled once If the value on the die is 1 2 or 3 the die is rolled a second time What is the probability that the sum total of values that turn up is at least 6 dfrac 10 21 dfrac 5 12 dfrac 2 3 dfrac 1 6 + output: Bayers Theorem + + input: {question} + output: + """ + response = getAnthropicresponse(anthropicClient, fewShotPrompt) + predictedTagList.append(response) + testfile['predictedTags'] = predictedTagList + testfile.to_csv(os.path.join(os.getcwd(),'Predicted_answers.csv')) + +elif(os.environ['CM_ML_MODEL_NAME'] == "GPT3.5"): + testfile = get_data_file(os.environ['CM_DATASET_SOLUTION_PATH']) + tagListPath = os.environ["CM_DATASET_TAGS"] + tagListPath=r"{}".format(tagListPath) + openAIClient = getOpenAIClient(os.environ["OPENAI_API_KEY"]) + with open(tagListPath, 'r') as file: + # Load the JSON data + data = json.load(file) + # Obtain the value of the key + tagList = None + for key in data: + tagList = data[key] + predictedTagList = [] + for question in tqdm(testfile['Question']): + fewShotPrompt = f""" + You are an expert topic classifier. Your role is to analyse a question and classify the question into any of the following question Tags: + {tagList} + The steps to infer the answer would be: + 1. Analyse the question + 2. Infer the topic to which the question belongs. + 3. Check if the question belongs to any of the specific area within the infered topic. eg; if the topic to which the question belongs is arrays and on further analysis, if the question can be specifically associated with subtopic such as array multipliers (which comes within the topic array), then the topic to be asssigned to the question is array multiplier. + 4. If any specific subtopic is found , return that as answer else return the topic as the answer. + 5. return only the answer as string , dont explain. + + Here are some examples: + + input: The expression large frac x y x y 2 is equal to The maximum of x and y The minimum of x and y 1 None of the above + output: Absolute Value + + input: The number of full and half adders required to add 16 bit numbers is 8 half adders 8 full adders 1 half adder 15 full adders 16 half adders 0 full adders 4 half adders 12 full adders + output: Adder + + input: Suppose a fair six sided die is rolled once If the value on the die is 1 2 or 3 the die is rolled a second time What is the probability that the sum total of values that turn up is at least 6 dfrac 10 21 dfrac 5 12 dfrac 2 3 dfrac 1 6 + output: Bayers Theorem + + input: {question} + output: + """ + response = getOpenAIresponse(openAIClient, fewShotPrompt) + predictedTagList.append(response) + testfile['predictedTags'] = predictedTagList + testfile.to_csv(os.path.join(os.getcwd(),'Predicted_answers.csv')) +else: + testfile = get_data_file(os.environ['CM_PREPROCESSED_DATASET_TEST_PATH']) + ans_list = get_trainfile_solnval(os.environ['CM_PREPROCESSED_DATASET_TRAIN_PATH']) + soln_file = get_data_file(os.environ['CM_DATASET_SOLUTION_PATH'])["Tag"] + loaded_model = pickle.load(open(os.environ['CM_ML_MODEL'], 'rb')) + tfidfvect = pickle.load(open(os.environ['CM_DATASET_TRAINED_MODEL_TFIDQ'], 'rb')) + p=loaded_model.predict(tfidfvect.transform(testfile['Question'])) + prob=loaded_model.predict_proba(tfidfvect.transform(testfile['Question'])) + main_list=[] + sub_list=[] + solutions = [] + sub_solutions = [] + for ques_prob in prob: + for probs in ques_prob: + if probs>0.02: + sub_list.append(probs) + index = np.where(ques_prob==probs)[0].tolist()[0] + sub_solutions.append(ans_list[index]) + main_list.append(sub_list) + solutions.append(sub_solutions) + sub_list=[] + sub_solutions=[] + testfile['Tag'] = p + testfile['Actual soln'] = soln_file + testfile['PredictedLabels'] = solutions + testfile['Probabilities'] = main_list + testfile.to_csv('Predicted_answers.csv') \ No newline at end of file diff --git a/script/app-question-classification/run.bat b/script/app-question-classification/run.bat new file mode 100644 index 0000000..4db8d84 --- /dev/null +++ b/script/app-question-classification/run.bat @@ -0,0 +1,4 @@ +@echo off +"%CM_PYTHON_BIN_WITH_PATH%" "%CM_TMP_CURRENT_SCRIPT_PATH%\process.py" +echo CM_ML_MODEL_ANSWER=%CD%\Predicted_answers.csv > tmp-run-env.out +if %errorlevel% neq 0 exit /b 1 \ No newline at end of file diff --git a/script/app-question-classification/run.sh b/script/app-question-classification/run.sh index 05a7907..8077776 100644 --- a/script/app-question-classification/run.sh +++ b/script/app-question-classification/run.sh @@ -1,2 +1,8 @@ #!/bin/bash +echo "$CM_PYTHON_BIN_WITH_PATH" "$CM_TMP_CURRENT_SCRIPT_PATH/process.py" +"$CM_PYTHON_BIN_WITH_PATH" "$CM_TMP_CURRENT_SCRIPT_PATH/process.py" +if [ $? -ne 0 ]; then + exit 1 +fi +echo "CM_ML_MODEL_ANSWER=$(pwd)/Predicted_answers.csv" > tmp-run-env.out \ No newline at end of file diff --git a/script/get-dataset-question-topic-go/run.bat b/script/get-dataset-question-topic-go/run.bat new file mode 100644 index 0000000..c73a13d --- /dev/null +++ b/script/get-dataset-question-topic-go/run.bat @@ -0,0 +1,3 @@ +copy %CM_TMP_CURRENT_SCRIPT_PATH%\data\train.csv . +copy %CM_TMP_CURRENT_SCRIPT_PATH%\data\test.csv . +copy %CM_TMP_CURRENT_SCRIPT_PATH%\data\solution.csv . \ No newline at end of file diff --git a/script/get-preprocessed-dataset-question-topic-go/_cm.json b/script/get-preprocessed-dataset-question-topic-go/_cm.json index 3848cd4..74382aa 100644 --- a/script/get-preprocessed-dataset-question-topic-go/_cm.json +++ b/script/get-preprocessed-dataset-question-topic-go/_cm.json @@ -16,6 +16,9 @@ }, { "tags": "get,generic-python-lib,_tqdm" + }, + { + "tags": "get,generic-python-lib,_pandas" } ], "env": { diff --git a/script/get-preprocessed-dataset-question-topic-go/run.bat b/script/get-preprocessed-dataset-question-topic-go/run.bat new file mode 100644 index 0000000..07001fa --- /dev/null +++ b/script/get-preprocessed-dataset-question-topic-go/run.bat @@ -0,0 +1,4 @@ +@echo off +"%CM_PYTHON_BIN_WITH_PATH%" "%CM_TMP_CURRENT_SCRIPT_PATH%\process.py" +if %errorlevel% neq 0 exit /b 1 +echo CM_PREPROCESSED_DATASET_PATH=%CD%\questions.csv > tmp-run-env.out diff --git a/script/get-tags-from-preprocessed-dataset/_cm.json b/script/get-tags-from-preprocessed-dataset/_cm.json new file mode 100644 index 0000000..674e46f --- /dev/null +++ b/script/get-tags-from-preprocessed-dataset/_cm.json @@ -0,0 +1,40 @@ +{ + "alias": "get-tags-from-preprocessed-dataset", + "automation_alias": "script", + "automation_uid": "5b4e0237da074764", + "cache": true, + "deps": [ + { + "tags": "set,echo-off,win" + }, + { + "names": [ + "python", + "python3" + ], + "tags": "get,python3" + }, + { + "tags": "get,dataset,original,question-topic,go" + }, + { + "tags": "get,preprocessed,dataset,go,qa,question-topic" + }, + { + "tags": "get,generic-python-lib,_pandas" + } + ], + "env": { + "CM_DATASET": "QUESTION-TOPIC" + }, + "new_env_keys": [ + "CM_DATASET_TAGS" + ], + "tags": [ + "get", + "tags", + "preprocessed", + "dataset" + ], + "uid": "d48bcfe9b6dc40e3" +} diff --git a/script/get-tags-from-preprocessed-dataset/customize.py b/script/get-tags-from-preprocessed-dataset/customize.py new file mode 100644 index 0000000..2ec3912 --- /dev/null +++ b/script/get-tags-from-preprocessed-dataset/customize.py @@ -0,0 +1,16 @@ +import os + + +def preprocess(i): + + os_info = i['os_info'] + env = i['env'] + + return {'return':0} + + +def postprocess(i): + + env = i['env'] + + return {'return':0} diff --git a/script/get-tags-from-preprocessed-dataset/process.py b/script/get-tags-from-preprocessed-dataset/process.py new file mode 100644 index 0000000..419894a --- /dev/null +++ b/script/get-tags-from-preprocessed-dataset/process.py @@ -0,0 +1,15 @@ +import pandas as pd +import json +import os + +def get_data_file(filename): + dtrain=pd.read_csv(filename,header=0) + return dtrain + +testfile = get_data_file(os.environ['CM_PREPROCESSED_DATASET_SOLN_PATH']) +uniqueTags = testfile["Tag"].unique().tolist() +json_file_path = "tagList.json" +# Store the unique tags in a JSON file with key "tags" +with open(json_file_path, 'w') as json_file: + json.dump({"tags": uniqueTags}, json_file) +print(uniqueTags) \ No newline at end of file diff --git a/script/get-tags-from-preprocessed-dataset/run.bat b/script/get-tags-from-preprocessed-dataset/run.bat new file mode 100644 index 0000000..565059c --- /dev/null +++ b/script/get-tags-from-preprocessed-dataset/run.bat @@ -0,0 +1,5 @@ +@echo off +echo "%CM_PYTHON_BIN_WITH_PATH%" "%CM_TMP_CURRENT_SCRIPT_PATH%\process.py" +"%CM_PYTHON_BIN_WITH_PATH%" "%CM_TMP_CURRENT_SCRIPT_PATH%\process.py" +if %errorlevel% neq 0 exit /b 1 +echo CM_DATASET_TAGS=%CD%\tagList.json > tmp-run-env.out \ No newline at end of file diff --git a/script/get-tags-from-preprocessed-dataset/run.sh b/script/get-tags-from-preprocessed-dataset/run.sh new file mode 100644 index 0000000..80c8266 --- /dev/null +++ b/script/get-tags-from-preprocessed-dataset/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo "$CM_PYTHON_BIN_WITH_PATH" "$CM_TMP_CURRENT_SCRIPT_PATH/process.py" +"$CM_PYTHON_BIN_WITH_PATH" "$CM_TMP_CURRENT_SCRIPT_PATH/process.py" +if [ $? -ne 0 ]; then + exit 1 +fi +echo "CM_DATASET_TAGS=$(pwd)/tagList.json" > tmp-run-env.out diff --git a/script/postprocess-question-classification/customize.py b/script/postprocess-question-classification/customize.py index eb707b4..d2c9191 100644 --- a/script/postprocess-question-classification/customize.py +++ b/script/postprocess-question-classification/customize.py @@ -24,7 +24,11 @@ def postprocess(i): if(soln[i] in ast.literal_eval(model_soln[i])): correct = correct + 1 print("\tAccuracy by brute force approach is:"+str(correct/len(model_soln))) - + elif(env['CM_ML_MODEL_NAME'] == "GPT3.5" or env['CM_ML_MODEL_NAME'] == "CLAUDE_SONNET"): + modelSolnData=pd.read_csv(env['CM_ML_MODEL_ANSWER'],header=0) + accuracy = (modelSolnData['Tag'] == modelSolnData['predictedTags']).mean() + print(f"Accuracy through {env['CM_ML_MODEL_NAME']} is{accuracy}") + print(f"NOTE: The solution file is present in path: {env['CM_ML_MODEL_ANSWER']}") else: soln=pd.read_csv(env['CM_DATASET_SOLUTION_PATH'],header=0)["Tag"] model_soln=pd.read_csv(env['CM_ML_MODEL_ANSWER'],header=0)["PredictedLabels"]