diff --git a/nlp/IMDB_for_text_classfication.ipynb b/nlp/IMDB_for_text_classfication.ipynb new file mode 100644 index 0000000..a29e217 --- /dev/null +++ b/nlp/IMDB_for_text_classfication.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"PQQBUxGeOeFl"},"outputs":[],"source":["import numpy as np\n","import matplotlib.pyplot as plt\n","from tensorflow.keras.datasets import imdb"]},{"cell_type":"markdown","metadata":{"id":"WC2xyYXVch9M"},"source":["- IMDB 리뷰 데이터는 기존 데이터 셋과는 달리 이미 훈련 데이터와 테스트 데이터를 50:50 비율로 구분해서 제공\n","- imdb.load_data()의 인자로 num_words를 사용하면 이 데이터에서 등장 빈도 순위로 몇 등까지의 단어를 사용할 것인지를 의미한다.\n","- 예를 들어 10,000을 넣으면, 등장 빈도 순위가 1~10,000에 해당하는 단어만 사용한다."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4772,"status":"ok","timestamp":1650125301372,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"},"user_tz":-540},"id":"QxF2Y1sBOjX-","outputId":"f31b56ba-810c-4bbf-b5e8-74aeb248142d"},"outputs":[{"output_type":"stream","name":"stdout","text":["Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n","17465344/17464789 [==============================] - 0s 0us/step\n","17473536/17464789 [==============================] - 0s 0us/step\n","훈련용 리뷰 개수: 25000\n","테스트용 리뷰 개수: 25000\n","카테고리: 2\n"]}],"source":["(X_train, y_train), (X_test, y_test) = imdb.load_data()\n","\n","print('훈련용 리뷰 개수: {}'.format(len(X_train)))\n","print('테스트용 리뷰 개수: {}'.format(len(X_test)))\n","num_classes = len(set(y_train))\n","print('카테고리: {}'.format(num_classes))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1650125301373,"user":{"displayName":"JaeYoung 
Hwang","userId":"08071223562055378805"},"user_tz":-540},"id":"KMZ_ULG_ezYz","outputId":"e83f8662-34d5-49d1-e72f-501f954a8c2c"},"outputs":[{"output_type":"stream","name":"stdout","text":["첫번째 훈련용 리뷰 : [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]\n","첫번째 훈련용 리뷰의 레이블 : 1\n"]}],"source":["print('첫번째 훈련용 리뷰 :', X_train[0])\n","print('첫번째 훈련용 리뷰의 레이블 :', y_train[0])"]},{"cell_type":"markdown","source":["- 케라스의 Embedding()은 단어 각각에 대해 정수로 변환된 입력에 대해서 임베딩 작업을 수행한다.\n","\n","- 단어 각각에 정수를 부여하는 방법으로는 단어를 빈도수 순대로 정렬하고 순차적으로 정수를 부여하는 방법이 있다. 로이터 뉴스와 IMDB 리뷰 데이터는 이 방법을 사용하였으며 이미 이 작업이 끝난 상태이다.\n","\n","- 등장 빈도 순으로 단어를 정렬하여 정수를 부여하였을 때의 장점은 등장 빈도수가 적은 단어의 제거이다. 예를 들어서 25,000개의 단어가 있다고 가정하고, 해당 단어를 등장 빈도수가 높은 순서로 1부터 25,000까지 정수를 부여했다고 하자. 
이렇게 되면 등장 빈도 순으로 등수가 부여된 것과 다름없으므로 전처리 작업에서 1,000보다 큰 정수로 맵핑된 단어들을 제거한다면 등장 빈도 상위 1,000개의 단어만 남길 수 있다."],"metadata":{"id":"tE80zquxL3Ed"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"8AljbIPlezb3"},"outputs":[],"source":["# 단어 집합의 크기를 10,000으로 제한하고, 리뷰 최대 길이는 500으로 제한하여 패딩을 진행\n","import re\n","from tensorflow.keras.datasets import imdb\n","from tensorflow.keras.preprocessing.sequence import pad_sequences\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense, GRU, Embedding\n","from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n","from tensorflow.keras.models import load_model\n","\n","vocab_size = 10000\n","max_len = 500\n","\n","(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)\n","\n","X_train = pad_sequences(X_train, maxlen=max_len)\n","X_test = pad_sequences(X_test, maxlen=max_len)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jI1OVv1YezfQ","executionInfo":{"status":"ok","timestamp":1649907694771,"user_tz":-540,"elapsed":83393,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"ab0fe9c7-09be-4ac5-c673-0384af5ca528"},"outputs":[{"output_type":"stream","name":"stdout","text":["Epoch 1/15\n","312/313 [============================>.] - ETA: 0s - loss: 0.5099 - acc: 0.7586\n","Epoch 1: val_acc improved from -inf to 0.76940, saving model to GRU_model.h5\n","313/313 [==============================] - 16s 31ms/step - loss: 0.5098 - acc: 0.7585 - val_loss: 0.4884 - val_acc: 0.7694\n","Epoch 2/15\n","312/313 [============================>.] - ETA: 0s - loss: 0.3308 - acc: 0.8698\n","Epoch 2: val_acc improved from 0.76940 to 0.87560, saving model to GRU_model.h5\n","313/313 [==============================] - 9s 29ms/step - loss: 0.3306 - acc: 0.8698 - val_loss: 0.3097 - val_acc: 0.8756\n","Epoch 3/15\n","311/313 [============================>.] 
- ETA: 0s - loss: 0.2543 - acc: 0.9031\n","Epoch 3: val_acc improved from 0.87560 to 0.88160, saving model to GRU_model.h5\n","313/313 [==============================] - 9s 29ms/step - loss: 0.2543 - acc: 0.9032 - val_loss: 0.3290 - val_acc: 0.8816\n","Epoch 4/15\n","311/313 [============================>.] - ETA: 0s - loss: 0.2096 - acc: 0.9218\n","Epoch 4: val_acc improved from 0.88160 to 0.88700, saving model to GRU_model.h5\n","313/313 [==============================] - 10s 30ms/step - loss: 0.2097 - acc: 0.9216 - val_loss: 0.2762 - val_acc: 0.8870\n","Epoch 5/15\n","312/313 [============================>.] - ETA: 0s - loss: 0.1634 - acc: 0.9416\n","Epoch 5: val_acc did not improve from 0.88700\n","313/313 [==============================] - 9s 29ms/step - loss: 0.1639 - acc: 0.9414 - val_loss: 0.3014 - val_acc: 0.8750\n","Epoch 6/15\n","311/313 [============================>.] - ETA: 0s - loss: 0.1326 - acc: 0.9523\n","Epoch 6: val_acc improved from 0.88700 to 0.88940, saving model to GRU_model.h5\n","313/313 [==============================] - 9s 29ms/step - loss: 0.1327 - acc: 0.9522 - val_loss: 0.2905 - val_acc: 0.8894\n","Epoch 7/15\n","312/313 [============================>.] - ETA: 0s - loss: 0.1038 - acc: 0.9631\n","Epoch 7: val_acc improved from 0.88940 to 0.89380, saving model to GRU_model.h5\n","313/313 [==============================] - 9s 30ms/step - loss: 0.1037 - acc: 0.9632 - val_loss: 0.2838 - val_acc: 0.8938\n","Epoch 8/15\n","312/313 [============================>.] 
- ETA: 0s - loss: 0.0764 - acc: 0.9742\n","Epoch 8: val_acc did not improve from 0.89380\n","313/313 [==============================] - 9s 29ms/step - loss: 0.0764 - acc: 0.9742 - val_loss: 0.3494 - val_acc: 0.8840\n","Epoch 8: early stopping\n"]}],"source":["embedding_dim = 100\n","hidden_units = 128\n","\n","model = Sequential()\n","model.add(Embedding(vocab_size, embedding_dim))\n","model.add(GRU(hidden_units))\n","model.add(Dense(1, activation='sigmoid'))\n","\n","es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\n","mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)\n","\n","model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n","history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"--6-cTfMg8GL","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1649907706034,"user_tz":-540,"elapsed":11267,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"846d47bf-eb3f-4b92-fa85-561d0acb55f5"},"outputs":[{"output_type":"stream","name":"stdout","text":["782/782 [==============================] - 8s 9ms/step - loss: 0.3145 - acc: 0.8847\n","\n"," 테스트 정확도: 0.8847\n"]}],"source":["loaded_model = load_model('GRU_model.h5')\n","print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))"]}],"metadata":{"colab":{"name":"IMDB_for_text_classfication.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyM0sdBxCxha7+ZZYohdjL6j"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0} \ No newline at end of file diff --git a/nlp/NLP_with_1D_Conv.ipynb b/nlp/NLP_with_1D_Conv.ipynb new file mode 100644 index 0000000..e4c3a73 --- /dev/null +++ b/nlp/NLP_with_1D_Conv.ipynb @@ -0,0 +1 @@ 
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"NLP_with_1D_Conv.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyNc9WHOGIVISw819ryygQ1D"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"PQMnrQsXu7I3"},"outputs":[],"source":["from tensorflow.keras import datasets\n","from tensorflow.keras.preprocessing.sequence import pad_sequences"]},{"cell_type":"code","source":["vocab_size = 10000\n","(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)"],"metadata":{"id":"NsuwT-cSv2OC"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["print(X_train[:5])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CjFSAVUnv2QU","executionInfo":{"status":"ok","timestamp":1649766630291,"user_tz":-540,"elapsed":19,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"b282423b-d9f4-4daf-ff7b-7f2ed1bc3e3a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 
88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])\n"," list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 8255, 2, 349, 2637, 148, 605, 2, 8003, 15, 123, 125, 68, 2, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 2, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 2, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95])\n"," list([1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 2, 780, 8, 106, 14, 6905, 1338, 18, 6, 22, 12, 215, 28, 610, 40, 6, 87, 326, 23, 2300, 21, 23, 22, 12, 272, 40, 57, 31, 11, 4, 22, 47, 6, 2307, 51, 9, 170, 23, 595, 116, 595, 1352, 13, 191, 79, 638, 89, 2, 14, 9, 8, 106, 607, 624, 35, 534, 6, 227, 7, 129, 113])\n"," list([1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153, 103, 4, 1494, 13, 70, 131, 67, 11, 61, 2, 744, 35, 3715, 761, 61, 5766, 452, 9214, 4, 985, 7, 2, 59, 166, 4, 105, 216, 1239, 41, 1797, 9, 15, 7, 35, 744, 2413, 31, 8, 4, 687, 23, 4, 2, 7339, 6, 3693, 42, 38, 39, 121, 59, 456, 10, 10, 7, 265, 12, 575, 111, 153, 159, 59, 16, 1447, 21, 25, 586, 482, 39, 4, 96, 59, 716, 12, 4, 172, 65, 9, 579, 11, 6004, 4, 1615, 5, 2, 7, 5168, 17, 13, 7064, 12, 19, 6, 
464, 31, 314, 11, 2, 6, 719, 605, 11, 8, 202, 27, 310, 4, 3772, 3501, 8, 2722, 58, 10, 10, 537, 2116, 180, 40, 14, 413, 173, 7, 263, 112, 37, 152, 377, 4, 537, 263, 846, 579, 178, 54, 75, 71, 476, 36, 413, 263, 2504, 182, 5, 17, 75, 2306, 922, 36, 279, 131, 2895, 17, 2867, 42, 17, 35, 921, 2, 192, 5, 1219, 3890, 19, 2, 217, 4122, 1710, 537, 2, 1236, 5, 736, 10, 10, 61, 403, 9, 2, 40, 61, 4494, 5, 27, 4494, 159, 90, 263, 2311, 4319, 309, 8, 178, 5, 82, 4319, 4, 65, 15, 9225, 145, 143, 5122, 12, 7039, 537, 746, 537, 537, 15, 7979, 4, 2, 594, 7, 5168, 94, 9096, 3987, 2, 11, 2, 4, 538, 7, 1795, 246, 2, 9, 2, 11, 635, 14, 9, 51, 408, 12, 94, 318, 1382, 12, 47, 6, 2683, 936, 5, 6307, 2, 19, 49, 7, 4, 1885, 2, 1118, 25, 80, 126, 842, 10, 10, 2, 2, 4726, 27, 4494, 11, 1550, 3633, 159, 27, 341, 29, 2733, 19, 4185, 173, 7, 90, 2, 8, 30, 11, 4, 1784, 86, 1117, 8, 3261, 46, 11, 2, 21, 29, 9, 2841, 23, 4, 1010, 2, 793, 6, 2, 1386, 1830, 10, 10, 246, 50, 9, 6, 2750, 1944, 746, 90, 29, 2, 8, 124, 4, 882, 4, 882, 496, 27, 2, 2213, 537, 121, 127, 1219, 130, 5, 29, 494, 8, 124, 4, 882, 496, 4, 341, 7, 27, 846, 10, 10, 29, 9, 1906, 8, 97, 6, 236, 2, 1311, 8, 4, 2, 7, 31, 7, 2, 91, 2, 3987, 70, 4, 882, 30, 579, 42, 9, 12, 32, 11, 537, 10, 10, 11, 14, 65, 44, 537, 75, 2, 1775, 3353, 2, 1846, 4, 2, 7, 154, 5, 4, 518, 53, 2, 2, 7, 3211, 882, 11, 399, 38, 75, 257, 3807, 19, 2, 17, 29, 456, 4, 65, 7, 27, 205, 113, 10, 10, 2, 4, 2, 2, 9, 242, 4, 91, 1202, 2, 5, 2070, 307, 22, 7, 5168, 126, 93, 40, 2, 13, 188, 1076, 3222, 19, 4, 2, 7, 2348, 537, 23, 53, 537, 21, 82, 40, 2, 13, 2, 14, 280, 13, 219, 4, 2, 431, 758, 859, 4, 953, 1052, 2, 7, 5991, 5, 94, 40, 25, 238, 60, 2, 4, 2, 804, 2, 7, 4, 9941, 132, 8, 67, 6, 22, 15, 9, 283, 8, 5168, 14, 31, 9, 242, 955, 48, 25, 279, 2, 23, 12, 1685, 195, 25, 238, 60, 796, 2, 4, 671, 7, 2804, 5, 4, 559, 154, 888, 7, 726, 50, 26, 49, 7008, 15, 566, 30, 579, 21, 64, 2574])\n"," list([1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 14, 20, 56, 33, 2401, 18, 457, 
88, 13, 2626, 1400, 45, 3171, 13, 70, 79, 49, 706, 919, 13, 16, 355, 340, 355, 1696, 96, 143, 4, 22, 32, 289, 7, 61, 369, 71, 2359, 5, 13, 16, 131, 2073, 249, 114, 249, 229, 249, 20, 13, 28, 126, 110, 13, 473, 8, 569, 61, 419, 56, 429, 6, 1513, 18, 35, 534, 95, 474, 570, 5, 25, 124, 138, 88, 12, 421, 1543, 52, 725, 6397, 61, 419, 11, 13, 1571, 15, 1543, 20, 11, 4, 2, 5, 296, 12, 3524, 5, 15, 421, 128, 74, 233, 334, 207, 126, 224, 12, 562, 298, 2167, 1272, 7, 2601, 5, 516, 988, 43, 8, 79, 120, 15, 595, 13, 784, 25, 3171, 18, 165, 170, 143, 19, 14, 5, 7224, 6, 226, 251, 7, 61, 113])]\n"]}]},{"cell_type":"code","source":["max_len = 200\n","X_train = pad_sequences(X_train, maxlen=max_len)\n","X_test = pad_sequences(X_test, maxlen=max_len)"],"metadata":{"id":"o1kwh9gSv2S7"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["print('X_train의 크기(shape) :',X_train.shape)\n","print('X_test의 크기(shape) :',X_test.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ryqGH4mpv2Vj","executionInfo":{"status":"ok","timestamp":1649766631313,"user_tz":-540,"elapsed":395,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"0075ba84-15a3-4638-ff94-04e5a627ceb9"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["X_train의 크기(shape) : (25000, 200)\n","X_test의 크기(shape) : (25000, 200)\n"]}]},{"cell_type":"code","source":["print(y_train[:5])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"f8X3a3K-v2aO","executionInfo":{"status":"ok","timestamp":1649766631313,"user_tz":-540,"elapsed":5,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"9a6d1352-7ed4-4284-8df3-b8c772278393"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[1 0 0 1 0]\n"]}]},{"cell_type":"code","source":["from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, 
Dense\n","from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n","from tensorflow.keras.models import load_model\n","\n","embedding_dim = 256 # 임베딩 벡터의 차원\n","dropout_ratio = 0.3 # 드롭아웃 비율\n","num_filters = 256 # 커널의 수\n","kernel_size = 3 # 커널의 크기\n","hidden_units = 128 # 뉴런의 수\n","\n","model = Sequential()\n","model.add(Embedding(vocab_size, embedding_dim))\n","model.add(Dropout(dropout_ratio))\n","model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))\n","model.add(GlobalMaxPooling1D())\n","model.add(Dense(hidden_units, activation='relu'))\n","model.add(Dropout(dropout_ratio))\n","model.add(Dense(1, activation='sigmoid'))\n","\n","es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)\n","mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)\n","\n","model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n","history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[es, mc])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UOgyI6kxwAm2","executionInfo":{"status":"ok","timestamp":1649767612410,"user_tz":-540,"elapsed":981100,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"0c1102b5-aba3-4c05-bbd5-9e6a1e3f73f6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Epoch 1/20\n","782/782 [==============================] - ETA: 0s - loss: 0.4012 - acc: 0.8030\n","Epoch 1: val_acc improved from -inf to 0.88532, saving model to best_model.h5\n","782/782 [==============================] - 199s 253ms/step - loss: 0.4012 - acc: 0.8030 - val_loss: 0.2733 - val_acc: 0.8853\n","Epoch 2/20\n","782/782 [==============================] - ETA: 0s - loss: 0.2009 - acc: 0.9238\n","Epoch 2: val_acc improved from 0.88532 to 0.89120, saving model to best_model.h5\n","782/782 [==============================] - 196s 251ms/step - loss: 
0.2009 - acc: 0.9238 - val_loss: 0.2590 - val_acc: 0.8912\n","Epoch 3/20\n","782/782 [==============================] - ETA: 0s - loss: 0.0937 - acc: 0.9682\n","Epoch 3: val_acc did not improve from 0.89120\n","782/782 [==============================] - 195s 250ms/step - loss: 0.0937 - acc: 0.9682 - val_loss: 0.3462 - val_acc: 0.8813\n","Epoch 4/20\n","782/782 [==============================] - ETA: 0s - loss: 0.0426 - acc: 0.9854\n","Epoch 4: val_acc did not improve from 0.89120\n","782/782 [==============================] - 195s 249ms/step - loss: 0.0426 - acc: 0.9854 - val_loss: 0.4091 - val_acc: 0.8846\n","Epoch 5/20\n","782/782 [==============================] - ETA: 0s - loss: 0.0280 - acc: 0.9899\n","Epoch 5: val_acc did not improve from 0.89120\n","782/782 [==============================] - 195s 250ms/step - loss: 0.0280 - acc: 0.9899 - val_loss: 0.5106 - val_acc: 0.8745\n","Epoch 5: early stopping\n"]}]},{"cell_type":"code","source":["loaded_model = load_model('best_model.h5')\n","print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3g3eVvINwAp9","executionInfo":{"status":"ok","timestamp":1649767649509,"user_tz":-540,"elapsed":37103,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"ebf62474-f10d-4273-fc82-ed44b45f0c18"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["782/782 [==============================] - 37s 47ms/step - loss: 0.2590 - acc: 0.8912\n","\n"," 테스트 정확도: 0.8912\n"]}]}]} \ No newline at end of file diff --git a/nlp/keras_embedding_layer.ipynb b/nlp/keras_embedding_layer.ipynb new file mode 100644 index 0000000..b09b831 --- /dev/null +++ b/nlp/keras_embedding_layer.ipynb @@ -0,0 +1 @@ 
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"keras_embedding_layer.ipynb","provenance":[],"authorship_tag":"ABX9TyNMSiSpDjJGYlGbB79ZZuYg"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["import numpy\n","import tensorflow as tf\n","from numpy import array\n","from tensorflow.keras.preprocessing.text import Tokenizer\n","from tensorflow.keras.preprocessing.sequence import pad_sequences\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense,Flatten,Embedding\n"," \n","# 텍스트 리뷰 자료를 지정합니다.\n","docs = [\"너무 재밌네요\",\"최고예요\",\"참 잘 만든 영화예요\",\"추천하고 싶은 영화입니다\",\"한번 더 보고싶네요\",\"글쎄요\",\"별로예요\",\"생각보다 지루하네요\",\"연기가 어색해요\",\"재미없어요\"]\n","\n","# 긍정 리뷰는 1, 부정 리뷰는 0으로 클래스를 지정합니다.\n","classes = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])"],"metadata":{"id":"DmBgNX1WBOil","executionInfo":{"status":"ok","timestamp":1650025206140,"user_tz":-540,"elapsed":284,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}}},"execution_count":36,"outputs":[]},{"cell_type":"code","source":["# 토큰화 \n","token = Tokenizer()\n","token.fit_on_texts(docs)\n","print(token.word_index)\n","x = token.texts_to_sequences(docs)\n","print(\"\\n리뷰 텍스트, 토큰화 결과:\\n\", x)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8jEkvLYbKeHz","executionInfo":{"status":"ok","timestamp":1650025206594,"user_tz":-540,"elapsed":7,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"621de15b-3c64-4338-a532-08afd518c5dd"},"execution_count":37,"outputs":[{"output_type":"stream","name":"stdout","text":["{'너무': 1, '재밌네요': 2, '최고예요': 3, '참': 4, '잘': 5, '만든': 6, '영화예요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10, '한번': 11, '더': 12, '보고싶네요': 13, '글쎄요': 14, '별로예요': 15, '생각보다': 16, '지루하네요': 17, '연기가': 18, '어색해요': 19, '재미없어요': 20}\n","\n","리뷰 텍스트, 토큰화 결과:\n"," [[1, 2], [3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13], [14], [15], [16, 17], [18, 19], 
[20]]\n"]}]},{"cell_type":"code","source":["# 패딩, 서로 다른 길이의 데이터를 4로 맞추어 줍니다.\n","padded_x = pad_sequences(x, 4) \n","print(\"\\n패딩 결과:\\n\", padded_x)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QnJpouAUKhDy","executionInfo":{"status":"ok","timestamp":1650025206595,"user_tz":-540,"elapsed":7,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"cabc0818-01e3-4292-9958-81475648b6bc"},"execution_count":38,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","패딩 결과:\n"," [[ 0 0 1 2]\n"," [ 0 0 0 3]\n"," [ 4 5 6 7]\n"," [ 0 8 9 10]\n"," [ 0 11 12 13]\n"," [ 0 0 0 14]\n"," [ 0 0 0 15]\n"," [ 0 0 16 17]\n"," [ 0 0 18 19]\n"," [ 0 0 0 20]]\n"]}]},{"cell_type":"code","source":["#임베딩에 입력될 단어의 수를 지정합니다.\n","word_size = len(token.word_index) + 1"],"metadata":{"id":"3jiNgvXdKjh2","executionInfo":{"status":"ok","timestamp":1650025206595,"user_tz":-540,"elapsed":5,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}}},"execution_count":39,"outputs":[]},{"cell_type":"code","source":["#단어 임베딩을 포함하여 딥러닝 모델을 만들고 결과를 출력합니다.\n","model = Sequential()\n","model.add(Embedding(word_size, 8, input_length=4))\n","model.add(Flatten())\n","model.add(Dense(1, activation='sigmoid'))\n","model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n","model.fit(padded_x, classes, epochs=20)\n","print(\"\\n Accuracy: %.4f\" % (model.evaluate(padded_x, classes)[1]))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TcUZPU6XKlA6","executionInfo":{"status":"ok","timestamp":1650025244855,"user_tz":-540,"elapsed":38266,"user":{"displayName":"JaeYoung Hwang","userId":"08071223562055378805"}},"outputId":"24a87457-7753-4b6d-b931-df1130f3490b"},"execution_count":40,"outputs":[{"output_type":"stream","name":"stdout","text":["Epoch 1/20\n","1/1 [==============================] - 38s 38s/step - loss: 0.6923 - accuracy: 0.5000\n","Epoch 2/20\n","1/1 [==============================] - 0s 
8ms/step - loss: 0.6904 - accuracy: 0.5000\n","Epoch 3/20\n","1/1 [==============================] - 0s 10ms/step - loss: 0.6885 - accuracy: 0.5000\n","Epoch 4/20\n","1/1 [==============================] - 0s 17ms/step - loss: 0.6866 - accuracy: 0.6000\n","Epoch 5/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6847 - accuracy: 0.8000\n","Epoch 6/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6828 - accuracy: 0.8000\n","Epoch 7/20\n","1/1 [==============================] - 0s 8ms/step - loss: 0.6809 - accuracy: 0.8000\n","Epoch 8/20\n","1/1 [==============================] - 0s 14ms/step - loss: 0.6790 - accuracy: 0.8000\n","Epoch 9/20\n","1/1 [==============================] - 0s 13ms/step - loss: 0.6771 - accuracy: 0.8000\n","Epoch 10/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6752 - accuracy: 0.9000\n","Epoch 11/20\n","1/1 [==============================] - 0s 8ms/step - loss: 0.6733 - accuracy: 0.9000\n","Epoch 12/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6713 - accuracy: 0.9000\n","Epoch 13/20\n","1/1 [==============================] - 0s 7ms/step - loss: 0.6694 - accuracy: 0.9000\n","Epoch 14/20\n","1/1 [==============================] - 0s 6ms/step - loss: 0.6675 - accuracy: 0.9000\n","Epoch 15/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6655 - accuracy: 0.9000\n","Epoch 16/20\n","1/1 [==============================] - 0s 9ms/step - loss: 0.6635 - accuracy: 0.9000\n","Epoch 17/20\n","1/1 [==============================] - 0s 8ms/step - loss: 0.6616 - accuracy: 0.9000\n","Epoch 18/20\n","1/1 [==============================] - 0s 8ms/step - loss: 0.6596 - accuracy: 0.9000\n","Epoch 19/20\n","1/1 [==============================] - 0s 8ms/step - loss: 0.6576 - accuracy: 0.9000\n","Epoch 20/20\n","1/1 [==============================] - 0s 6ms/step - loss: 0.6555 - accuracy: 0.9000\n","1/1 [==============================] - 0s 138ms/step - loss: 
0.6535 - accuracy: 0.9000\n","\n"," Accuracy: 0.9000\n"]}]}]} \ No newline at end of file