From 6fb87056b5e11071716dacd6b8850a60dcac2ab0 Mon Sep 17 00:00:00 2001 From: berkedilekoglu Date: Sat, 26 Aug 2023 19:05:36 +0300 Subject: [PATCH] Delete .ipynb_checkpoints directory --- .ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 - .ipynb_checkpoints/tutorial-checkpoint.ipynb | 669 ------------------- 2 files changed, 675 deletions(-) delete mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb delete mode 100644 .ipynb_checkpoints/tutorial-checkpoint.ipynb diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 363fcab..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/.ipynb_checkpoints/tutorial-checkpoint.ipynb b/.ipynb_checkpoints/tutorial-checkpoint.ipynb deleted file mode 100644 index 77ba33c..0000000 --- a/.ipynb_checkpoints/tutorial-checkpoint.ipynb +++ /dev/null @@ -1,669 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# Example Usage of sumonet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Loading Data #" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can load data in 2 different ways:\n", - "\n", - "1) By using Encoding class -> Takes data path or data sequence and output encoded (one-hot, nlf, blosum62) vectors\n", - "\n", - "2) By using Data class -> It does not take any input, output our dbPTM data -> entire or sampled data can be taken" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Class ###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### You can use our data automatically by using Data Class####" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Data class gives X_train, X_test as samples so you need to encode them \n", - "- y_test, y_train are list so you need to convert them to a 2-d array" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sumonet.utils.load_data import Data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "data = Data()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, y_train, X_test, y_test = data.sample_data(ratio = 0.2) #ratio defined as 0.4 in class\n", - "# If you want to use entire data as we did, you can set ratio as 1." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A sample from X_train: LLPPSATASVKMEPENKYLPE\n" - ] - } - ], - "source": [ - "print(f'A sample from X_train: {X_train[0]}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encode samples and convert label list to 2-d vectors###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encoding Class ###" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sumonet.utils.encodings import Encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Define Encoding class ###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Encoding class takes 2 parameters: encoderTypes and scaler.\n", - "\n", - "- encoderTypes is initially defined as blosum62 according to our experiments but you can use one-hot or nlf also\n", - "- scaler is initially defined as True according to our experiments. It means that data will be passed into min-max scaler. If you want you can cancel it.\n", - "- You can change encoder type with set_encoder_type(encoderType) function" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "encoder = Encoding(encoderType='one-hot') ## Encoding(encoderType = 'blosum62', scale = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, y_train = encoder.get_encoded_vectors_from_data(X_train, y_train)\n", - "X_test, y_test = encoder.get_encoded_vectors_from_data(X_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape of the train and test samples are: X_train = (1912, 21, 21) || X_test = (211, 21, 21)\n", - "Shape of the train and test labels are: y_train = (1912, 2) || y_test = (211, 2)\n" - ] - } - ], - "source": [ - "print(f\"Shape of the train and test samples are: X_train = {X_train.shape} || X_test = {X_test.shape}\")\n", - "print(f\"Shape of the train and test labels are: y_train = {y_train.shape} || y_test = {y_test.shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Or you can use data path (we use ours in that tutorial) to take encoded vectors ###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### You can give data path ###" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "trainDataPath = \"sumonet/data/train\"\n", - "testDataPath = \"sumonet/data/test\"\n", - "\n", - "dataPathPositiveTrain = trainDataPath+'/Sumoylation_pos_Train.fasta'\n", - "dataPathNegativeTrain = trainDataPath+'/Sumoylation_neg_Train.fasta'\n", - "\n", - "dataPathPositiveTest = testDataPath+'/Sumoylation_pos_Test.fasta'\n", - "dataPathNegativeTest = testDataPath+'/Sumoylation_neg_Test.fasta'" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "#Lets first change encoding type\n", - "encoder.set_encoder_type('blosum62')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### !! The order of the paths is important !! Positive train path should come first ###" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, y_train = encoder.get_encoded_vectors_from_path(dataPathPositiveTrain,dataPathNegativeTrain)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "X_test, y_test = encoder.get_encoded_vectors_from_path(dataPathPositiveTest,dataPathNegativeTest)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape of the train and test samples are: X_train = (19131, 21, 24) || X_test = (2126, 21, 24)\n", - "Shape of the train and test labels are: y_train = (19131, 2) || y_test = (2126, 2)\n" - ] - } - ], - "source": [ - "print(f\"Shape of the train and test samples are: X_train = {X_train.shape} || X_test = {X_test.shape}\")\n", - "print(f\"Shape of the train and test labels are: y_train = {y_train.shape} || y_test = {y_test.shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Now our data is ready ###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SUMOnet Model ##" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- You can use our architecture with randomly initialized weights\n", - "\n", - "- You can also use our pre-trained model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Let's import SUMOnet ####" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sumonet.model.architecture import SUMOnet\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### You can use our architecture with randomly initialized weights ####" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "model = SUMOnet()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### If you want to see summary of the model you need to build it with input shape ###" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "input_shape = X_train.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Build function takes entire shape because it takes batch_size #####" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "model.build(input_shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### model.summary will not show output shape because it is a subclass #####" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sum_onet\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "conv1d (Conv1D) multiple 6272 \n", - "_________________________________________________________________\n", - "bidirectional (Bidirectional multiple 14016 \n", - "_________________________________________________________________\n", - "global_average_pooling1d (Gl multiple 0 \n", - "_________________________________________________________________\n", - "dense (Dense) multiple 2112 \n", - "_________________________________________________________________\n", - "dropout (Dropout) multiple 0 \n", - "_________________________________________________________________\n", - "activation (Activation) multiple 0 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) multiple 8320 \n", - "_________________________________________________________________\n", - "dropout_1 (Dropout) multiple 0 \n", - "_________________________________________________________________\n", - "activation_1 (Activation) multiple 0 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) multiple 16512 \n", - "_________________________________________________________________\n", - "dropout_2 (Dropout) multiple 0 \n", - "_________________________________________________________________\n", - "activation_2 (Activation) multiple 0 \n", - "_________________________________________________________________\n", - "dense_3 (Dense) multiple 258 \n", - "_________________________________________________________________\n", - "activation_3 (Activation) multiple 0 \n", - "=================================================================\n", - "Total params: 47,490\n", - "Trainable params: 47,490\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Let's compile and train our model ####" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/3\n", - "598/598 [==============================] - 11s 14ms/step - loss: 0.5580 - accuracy: 0.7499\n", - "Epoch 2/3\n", - "598/598 [==============================] - 9s 14ms/step - loss: 0.4869 - accuracy: 0.7769\n", - "Epoch 3/3\n", - "598/598 [==============================] - 9s 15ms/step - loss: 0.4598 - accuracy: 0.7958\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.fit(X_train,y_train,epochs=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### You can use pre-trained model###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- By using load_weights function SUMOnet creates our provided model SUMOnet-3\n", - "- Again you need to build model first with input shape" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sumonet.model.architecture import SUMOnet\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "SUMOnet3_model = SUMOnet()\n", - "SUMOnet3_model.build(input_shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Let's load weights of pre-trained model ####" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "SUMOnet3_model.load_weights()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Now we can predict ####" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "y_preds = SUMOnet3_model.predict(X_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let's evaluate results ###" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### import evaluate function, which organized according to our evaluation set-up ####" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "from sumonet.evaluation.metrics import evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "evaluate function takes 3 arguments:\n", - "- y_test -> Gold labels should be in 1-d so if yours is 2-d as ours, use argmax(-1)\n", - "- y_pred -> Predictions are already 2-d vector\n", - "- string or array that includes metrics\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### You can calculate results one-by-one ####" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "f1_score = evaluate(y_test.argmax(-1),y_preds,'f1')\n", - "mcc = evaluate(y_test.argmax(-1),y_preds,'mcc')\n", - "roc = evaluate(y_test.argmax(-1),y_preds,'roc')\n", - "aupr = evaluate(y_test.argmax(-1),y_preds,'aupr')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "F1 score: {'f1': 0.6580921757770631}\n", - "MCC score: {'mcc': 0.5694399870602478}\n", - "ROC score: {'roc': 0.8713018549625735}\n", - "AUPR score: {'aupr': 0.7598319565641193}\n" - ] - } - ], - "source": [ - "print(f\"F1 score: \", f1_score)\n", - "print(f\"MCC score: \", mcc)\n", - "print(f\"ROC score: \", roc)\n", - "print(f\"AUPR score: \", aupr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### You can calculate all results at once ####\n", - "\n", - "- This calculation outputs a dictionary" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'aupr': 0.7598319565641193,\n", - " 'f1': 0.6580921757770631,\n", - " 'mcc': 0.5694399870602478,\n", - " 'roc': 0.8713018549625735}" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluate(y_test.argmax(-1),y_preds,['f1','mcc','roc','aupr'])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}