Skip to content

Commit

Permalink
Merge pull request #6 from anandhu-eng/main
Browse files: browse the repository at this point in the history
Added inference support for OpenAI GPT3.5 Turbo
  • Loading branch information
arjunsuresh authored May 9, 2024
2 parents cc8a3d3 + 5e09e75 commit 9c99aac
Show file tree
Hide file tree
Showing 14 changed files with 367 additions and 89 deletions.
60 changes: 55 additions & 5 deletions script/app-question-classification/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,19 @@
"alias": "app-question-classification",
"automation_alias": "script",
"automation_uid": "5b4e0237da074764",
"cache": false,
"cache": true,
"default_variation": "rh",
"deps": [
{
"tags": "set,echo-off,win"
},
{
"names": [
"python",
"python3"
],
"tags": "get,python3"
},
{
"tags": "get,dataset,original,question-topic,go"
},
Expand All @@ -20,9 +30,6 @@
{
"tags": "get,generic-python-lib,_numpy"
},
{
"tags": "get,generic-python-lib,_torch"
},
{
"tags": "get,generic-python-lib,_tqdm"
},
Expand All @@ -43,7 +50,8 @@
"CM_DATASET": "QUESTION-TOPIC"
},
"new_env_keys": [
"CM_DATASET_OUTPUT_MODEL*"
"CM_DATASET_OUTPUT_MODEL*",
"CM_ML_MODEL_ANSWER"
],
"tags": [
"app",
Expand All @@ -61,6 +69,9 @@
"deps":[
{
"tags": "get, ml-model, question-classification, go, qa, question-topic, _rh"
},
{
"tags": "get,generic-python-lib,_torch"
}
]
},
Expand All @@ -71,9 +82,48 @@
"deps":[
{
"tags": "get, ml-model, question-classification, go, qa, question-topic, _rt"
},
{
"tags": "get,generic-python-lib,_torch"
}
]
},
"LLMGPT3.5":{
"env":{
"CM_ML_MODEL_NAME": "GPT3.5"
},
"deps":[
{
"tags": "get,tags,preprocessed,dataset"
},
{
"tags": "get,generic-python-lib,_openai"
}
]
},
"LLMCLAUDE_SONNET":{
"env":{
"CM_ML_MODEL_NAME": "CLAUDE_SONNET"
},
"deps":[
{
"tags": "get,tags,preprocessed,dataset"
},
{
"tags": "get,generic-python-lib,_anthropic"
}
]
},
"OPENAI_API_KEY.#":{
"env":{
"OPENAI_API_KEY" : "#"
}
},
"ANTHROPIC_API_KEY.#":{
"env":{
"ANTHROPIC_API_KEY" : "#"
}
},
"path.#":{
"env": {
"CM_DATASET_PATH": "#"
Expand Down
85 changes: 2 additions & 83 deletions script/app-question-classification/customize.py
Original file line number Diff line number Diff line change
@@ -1,97 +1,16 @@
from cmind import utils
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import csv
import numpy as np
from setfit import SetFitModel
from datasets import load_dataset
import torch


#get the input data to train the model
def get_data_file(filename):
    """Read the CSV at *filename* into a pandas DataFrame.

    The first row of the file is used as the column header.
    """
    return pd.read_csv(filename, header=0)

def get_trainfile_solnval(filename):
    """Return the distinct values of the 'Tag' column of a CSV file.

    The file is loaded with get_data_file(); values are returned as a list
    in order of first appearance.
    """
    tags = get_data_file(filename)['Tag']
    # dict.fromkeys keeps first-seen order while de-duplicating in one pass,
    # replacing the quadratic "if value not in list" scan.
    return list(dict.fromkeys(tags))

def preprocess(i):
    """Run question-topic inference and write 'Predicted_answers.csv'.

    Reads i['os_info'] and i['env'].  Depending on env['CM_ML_MODEL_NAME']:

    * "go_2": loads the "ANANDHU-SCT/TOPIC_CLASSIFICATION" dataset and the
      SetFit model at env['CM_ML_MODEL'], and stores, for every test
      question, the indices of its (up to) five highest-probability classes.
    * anything else: loads a pickled classifier (env['CM_ML_MODEL']) and a
      pickled vectorizer (env['CM_DATASET_TRAINED_MODEL_TFIDQ']), and stores,
      for every test question, each class whose probability exceeds 0.02
      together with that probability.

    The CSV is written to the current working directory.

    Returns:
        {'return': 0}
    """
    os_info = i['os_info']  # not used below; kept so a missing key still fails early

    env = i['env']

    if env['CM_ML_MODEL_NAME'] == "go_2":
        dataset = load_dataset("ANANDHU-SCT/TOPIC_CLASSIFICATION")
        model = SetFitModel.from_pretrained(env['CM_ML_MODEL'])
        test_split = dataset['test']
        probs = model.predict_proba(test_split['Question'])

        resultfile = pd.DataFrame()
        resultfile["Question"] = test_split["Question"]
        resultfile["Tag"] = test_split["Tag"]
        resultfile["Actual soln"] = test_split["label"]

        final_result = []
        for prob in probs:
            # Rows may arrive as numpy arrays or as torch tensors; convert
            # explicitly instead of relying on a bare except for dispatch.
            if isinstance(prob, np.ndarray):
                prob = torch.from_numpy(prob)
            # torch.topk raises when k exceeds the number of entries.
            top_k = min(5, prob.shape[-1])
            _, topk_indices = torch.topk(prob, k=top_k)
            final_result.append(topk_indices.tolist())

        resultfile["PredictedLabels"] = final_result
        resultfile.to_csv('Predicted_answers.csv')
        return {'return': 0}

    testfile = get_data_file(env['CM_PREPROCESSED_DATASET_TEST_PATH'])
    ans_list = get_trainfile_solnval(env['CM_PREPROCESSED_DATASET_TRAIN_PATH'])
    soln_file = get_data_file(env['CM_DATASET_SOLUTION_PATH'])["Tag"]

    # SECURITY NOTE: pickle.load executes arbitrary code from the file; these
    # paths must only ever point at trusted artifacts.
    with open(env['CM_ML_MODEL'], 'rb') as model_fh:
        loaded_model = pickle.load(model_fh)
    with open(env['CM_DATASET_TRAINED_MODEL_TFIDQ'], 'rb') as vect_fh:
        tfidfvect = pickle.load(vect_fh)

    # Vectorize once and reuse the features for both predictions.
    features = tfidfvect.transform(testfile['Question'])
    p = loaded_model.predict(features)
    prob = loaded_model.predict_proba(features)

    main_list = []
    solutions = []
    for ques_prob in prob:
        # Track the column index directly: looking it up by value
        # (np.where(ques_prob == value)) returns the first match and therefore
        # the wrong label whenever two classes share the same probability.
        # NOTE(review): ans_list is ordered by first appearance in the training
        # file; this assumes that matches the model's probability column order
        # -- TODO confirm.
        kept = [
            (value, ans_list[index])
            for index, value in enumerate(ques_prob)
            if value > 0.02
        ]
        main_list.append([value for value, _ in kept])
        solutions.append([label for _, label in kept])

    testfile['Tag'] = p
    testfile['Actual soln'] = soln_file
    testfile['PredictedLabels'] = solutions
    testfile['Probabilities'] = main_list

    testfile.to_csv('Predicted_answers.csv')

    return {'return': 0}


def postprocess(i):
    """Record the location of the predictions CSV in the environment.

    Sets i['env']['CM_ML_MODEL_ANSWER'] to 'Predicted_answers.csv' inside the
    current working directory.

    Returns:
        {'return': 0}
    """
    state = i['env']
    answers_csv = os.path.join(os.getcwd(), "Predicted_answers.csv")
    state['CM_ML_MODEL_ANSWER'] = answers_csv
    return {'return': 0}
Loading

0 comments on commit 9c99aac

Please sign in to comment.