import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
data = pd.read_csv('MBTI 500.csv')  # adjust the path to your file
X = data['text']
y = data['mbti']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
def tokenize_and_pad(texts, max_len=50):
    # Whitespace-tokenize each text, truncate to max_len, and pad with '' tokens
    tokenized_texts = [text.split() for text in texts]
    padded_texts = [tokens[:max_len] + [''] * max(0, max_len - len(tokens))
                    for tokens in tokenized_texts]
    return padded_texts
max_len = 50  # maximum sequence length
X_train_tokens = tokenize_and_pad(X_train, max_len)
X_test_tokens = tokenize_and_pad(X_test, max_len)
# Build the vocabulary; the '' padding token maps to index 0.
# Note: the vocabulary is built from both train and test tokens so every
# test token has an index (at the cost of some train/test leakage).
word_to_idx = {'': 0}
for tokens in X_train_tokens + X_test_tokens:
    for token in tokens:
        if token not in word_to_idx:
            word_to_idx[token] = len(word_to_idx)
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
def tokens_to_indices(tokens):
    return [word_to_idx[token] for token in tokens]
X_train_indices = [tokens_to_indices(tokens) for tokens in X_train_tokens]
X_test_indices = [tokens_to_indices(tokens) for tokens in X_test_tokens]
batch_size = 64
train_dataset = TensorDataset(torch.tensor(X_train_indices, dtype=torch.long),
                              torch.tensor(y_train, dtype=torch.long))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(torch.tensor(X_test_indices, dtype=torch.long),
                             torch.tensor(y_test, dtype=torch.long))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
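# Optional sanity check (not part of the original script): pull one batch to
# confirm the inputs are (batch_size, max_len) and the labels are (batch_size,).
sample_inputs, sample_labels = next(iter(train_loader))
print(sample_inputs.shape, sample_labels.shape)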
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        # padding_idx=0 keeps the '' padding token as a zero embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        # Classify from the final hidden state and return raw logits;
        # nn.CrossEntropyLoss applies log-softmax internally, so an explicit
        # Softmax layer here would be redundant and hurt training.
        return self.fc(hidden[-1, :, :])
vocab_size = len(word_to_idx)
embedding_dim = 50
hidden_dim = 100
output_dim = len(label_encoder.classes_)  # 16 MBTI classes
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(lstm_model.parameters(), lr=0.001)
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}/{num_epochs}, last batch loss: {loss.item():.4f}')
train_model(lstm_model, train_loader, criterion, optimizer)
lstm_model.eval()
all_preds = []
with torch.no_grad():
    for inputs, _ in test_loader:
        outputs = lstm_model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.numpy())
# y_test was already label-encoded before the split, so compare against it directly
accuracy = accuracy_score(y_test, all_preds)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, all_preds))
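# Optional: a minimal inference sketch, not part of the original script.
# It reuses tokenize_and_pad, word_to_idx, and label_encoder from above;
# 'sample_text' is a hypothetical input, and unknown words fall back to the
# padding index 0 instead of raising a KeyError.
def predict_mbti(text, model=lstm_model, max_len=50):
    tokens = tokenize_and_pad([text], max_len)[0]
    indices = [word_to_idx.get(token, 0) for token in tokens]
    model.eval()
    with torch.no_grad():
        logits = model(torch.tensor([indices], dtype=torch.long))
        pred_idx = logits.argmax(dim=1).item()
    return label_encoder.inverse_transform([pred_idx])[0]

sample_text = 'I enjoy quiet evenings planning my next project'
print(predict_mbti(sample_text))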