diff --git a/Probelm Starting.ipynb b/Probelm Starting.ipynb new file mode 100644 index 0000000..b088c0d --- /dev/null +++ b/Probelm Starting.ipynb @@ -0,0 +1,673 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 783, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "import re\n", + "import os\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 707, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "topdir ='/Users/keithcalise/Github/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/'\n", + "testdir = '/Users/keithcalise/GitHub/programming-language-classifier/test/'\n", + "file_extensions = {\".clj\":\"Clojure\",\n", + " \".cljs\": \"Clojure\",\n", + " \".clojure\": \"Clojure\",\n", + " \".hs\": \"Haskell\",\n", + " \".lhs\": \"Haskell\",\n", + " \".java\": \"Java\",\n", + " \".class\": \"Java\",\n", + " \".jar\": \"Java\",\n", + " \".js\": \"Javascript\",\n", + " \".javascript\": \"Javascript\",\n", + " \".pl\": \"Perl\",\n", + " \".pm\": \"Perl\",\n", + " \".t\": \"Perl\",\n", + " \".pod\": \"Perl\",\n", + " \".php\": \"PHP\",\n", + " \".php4\": \"PHP\",\n", + " \".perl\": \"Perl\",\n", + " \".phtml\": \"PHP\",\n", + " \".py\": \"Python\",\n", + " \".pyw\": \"Python\",\n", + " \".pyc\": \"Python\",\n", + " \".pyo\": \"Python\",\n", + " \".pyd\": \"Python\",\n", + " \".python3\": \"Python\",\n", + " \".Python2\": \"Python\",\n", + " \".rb\": \"Ruby\",\n", + " \".rbw\": \"Ruby\",\n", + " \".jruby\": \"Ruby\",\n", + " \".scala\": \"Scala\",\n", + " \".scm\": \"Scheme\",\n", + " \".ss\": \"Scheme\",\n", + " \".tcl\": \"Tcl\",\n", + " \".racket\": \"Scheme\",\n", + " \".ghc\": \"Haskell\",\n", + " \".Tcl\": \"Tool Command Language\"}\n", + "\n", + "\n", + "def tupler(dic):\n", + " keylist = []\n", + " for key in dic:\n", + " keylist.append(key)\n", + " tupl = tuple(keylist)\n", + " return tupl\n", + "\n", + "\n", + "def get_files_dirs(directory):\n", + " data = []\n", + " for dirpath, dirnames, names in os.walk(topdir):\n", + " for name in names:\n", + " if name.lower().endswith(tupler(file_extensions)):\n", + " with open(os.path.join(dirpath,name)) as current_file:\n", + " data.append(current_file.read())\n", + "\n", + " return data \n", + "\n", + "\n", + "def code_getter(directory):\n", + " \n", + " typelist = []\n", + " for dirpath, dirnames, names in os.walk(topdir):\n", + " for file_name in names:\n", + " name, extension = os.path.splitext(file_name)\n", + " if extension in file_extensions:\n", + "# print(file_extensions[extension])\n", + " typelist.append(file_extensions[extension])\n", + " return typelist" + ] + }, + { + "cell_type": "code", + "execution_count": 708, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def count_char(code, char):\n", + " counter = 0\n", + " for i in code:\n", + " if i == char:\n", + " counter+=1\n", + " return counter\n", + "def char_centage(code, char):\n", + " num = count_char(code, char)/len(code)\n", + " return num\n", + "def stringer(code, string):\n", + " num = len(re.findall(string, code))\n", + " return num\n", + "def string_centage(code,string):\n", + " num = stringer(code, string)/len(code)\n", + " return num\n", + "# string_centage(df.Code[3],'check')\n", + "# count_char(df.Code[3],\"-\")\n", + "# char_centage(df.Code[3],\"-\")" + ] + }, + { + "cell_type": "code", + "execution_count": 709, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 709, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "string_centage(df.loc[20]['Code'], 'elif')" + ] + }, + { + "cell_type": "code", + "execution_count": 710, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def df_maker(directory):\n", + " filename_list = get_files_dirs(directory)\n", + " typelist = code_getter(directory)\n", + " df = pd.DataFrame(typelist, index = range(352))\n", + " df.columns = [\"Language\"]\n", + " df['Code']=filename_list\n", + " df[\"Language\"]=df.Language.apply(lambda x:x.lower())\n", + " df['Code Length']=df.Code.apply(lambda x:len(x))\n", + " return df\n", + "df = df_maker(topdir)" + ] + }, + { + "cell_type": "code", + "execution_count": 711, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00371900826446281" + ] + }, + "execution_count": 711, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "char_centage(df['Code'][0],\"^\")" + ] + }, + { + "cell_type": "code", + "execution_count": 712, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def carat(code):\n", + " return char_centage(code, '^')*100\n", + " \n", + "def ampersand(code):\n", + " return char_centage(code,\"&\")*100\n", + " \n", + "def double_equals(code):\n", + " return char_centage(code, '==')*100\n", + "\n", + "def semicolon(code):\n", + " return char_centage(code,';')*100\n", + "def exclamation(code):\n", + " return char_centage(code, '!')*100\n", + "def comma(code):\n", + " return char_centage(code,',')*100\n", + "def colon(code):\n", + " return char_centage(code,':')*100\n", + "def pipe(code):\n", + " return char_centage(code,'|')*100\n", + "\n", + "def elif_(code):\n", + " return string_centage(code, 'elif')*100\n", + "def if_(code):\n", + " return string_centage(code, 'if')*100\n", + "def return_(code):\n", + " return string_centage(code, 'return')*100\n", + "def println(code):\n", + " return string_centage(code,'println')*100\n", + "def open_curly(code):\n", + " return char_centage(code, '{')*100\n", + "def end(code):\n", + " return string_centage(code,'end')*100\n", + "def require(code):\n", + " return string_centage(code,'require')*100\n", + "def void(code):\n", + " return string_centage(code,'void')*100\n", + "# def column_generator(dataframe):\n", + "# punc_list = ['^',':',',','$','|','!','&']\n", + "# for punc in punc_list:\n", + "# for code in df.Code:\n", + "# code = code.split()\n", + "# dataframe[punc]=dataframe.Code.apply(char_centage(code, punc))\n", + "# return dataframe\n", + "# column_generator(df)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 713, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['^'] = df.Code.apply(carat)\n", + "df['&'] = df.Code.apply(ampersand)\n", + "df['=='] = df.Code.apply(double_equals)\n", + "df[';'] = df.Code.apply(semicolon)\n", + "df[':'] = df.Code.apply(colon)\n", + "df[','] = df.Code.apply(comma)\n", + "df['!'] = df.Code.apply(exclamation)\n", + "df['|'] = df.Code.apply(pipe)\n", + "df['elif'] = df.Code.apply(elif_)\n", + "df['if'] = df.Code.apply(if_)\n", + "df['return'] = df.Code.apply(return_)\n", + "df['println'] = df.Code.apply(println)\n", + "df['{'] = df.Code.apply(open_curly)\n", + "df['end']=df.Code.apply(end)\n", + "df['require'] = df.Code.apply(require)\n", + "df['void']=df.Code.apply(void)" + ] + }, + { + "cell_type": "code", + "execution_count": 714, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tester= pd.read_csv('test.csv', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 715, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def test_code_getter(directory):\n", + " test_list = []\n", + " for subdir, dirs, files in os.walk(directory):\n", + " for filname in files:\n", + " with open(os.path.join(subdir, filname)) as current_file:\n", + " test_list.append(current_file.read())\n", + " return test_list" + ] + }, + { + "cell_type": "code", + "execution_count": 716, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "test_code_list = test_code_getter(testdir)" + ] + }, + { + "cell_type": "code", + "execution_count": 717, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tester['Tester_code'] = test_code_list" + ] + }, + { + "cell_type": "code", + "execution_count": 718, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tester[\"Code Length\"]= tester.Tester_code.apply(lambda x:len(x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 719, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tester['^'] = df.Code.apply(carat)\n", + "tester['&'] = df.Code.apply(ampersand)\n", + "tester['=='] = df.Code.apply(double_equals)\n", + "tester[';'] = df.Code.apply(semicolon)\n", + "tester[':'] = df.Code.apply(colon)\n", + "tester[','] = df.Code.apply(comma)\n", + "tester['!'] = df.Code.apply(exclamation)\n", + "tester['|'] = df.Code.apply(pipe)\n", + "tester['elif'] = df.Code.apply(elif_)\n", + "tester['if'] = df.Code.apply(if_)\n", + "tester['return'] = df.Code.apply(return_)\n", + "tester['println'] = df.Code.apply(println)\n", + "tester['{'] = df.Code.apply(open_curly)\n", + "tester['end']=df.Code.apply(end)\n", + "tester['require'] = df.Code.apply(require)\n", + "tester['void'] = df.Code.apply(void)" + ] + }, + { + "cell_type": "code", + "execution_count": 720, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tester = tester.rename(columns = {1:'Language'},)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally ready for sci-kit learn...\n" + ] + }, + { + "cell_type": "code", + "execution_count": 765, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sklearn.tree\n", + "import numpy as np\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.naive_bayes import MultinomialNB " + ] + }, + { + "cell_type": "code", + "execution_count": 766, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# X_train = df.loc[:,\"^\":]\n", + "# X_test = tester.loc[:,'^':]\n", + "# y_train = df.Language\n", + "# y_test = tester.Language\n" + ] + }, + { + "cell_type": "code", + "execution_count": 775, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df.loc[:,'^':], df.Language, test_size=0.40)" + ] + }, + { + "cell_type": "code", + "execution_count": 776, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tree = sklearn.tree.DecisionTreeClassifier()\n", + "tree = tree.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 777, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 0.85 0.69 0.76 16\n", + " haskell 1.00 0.79 0.88 14\n", + " java 0.93 1.00 0.96 25\n", + " javascript 0.36 0.45 0.40 11\n", + " perl 0.36 0.36 0.36 11\n", + " php 0.54 0.58 0.56 12\n", + " python 0.79 1.00 0.88 11\n", + " ruby 0.94 0.79 0.86 19\n", + " scala 0.71 0.77 0.74 13\n", + " scheme 0.88 0.78 0.82 9\n", + "\n", + "avg / total 0.77 0.75 0.76 141\n", + "\n", + "[[11 0 1 1 0 3 0 0 0 0]\n", + " [ 0 11 0 0 0 0 3 0 0 0]\n", + " [ 0 0 25 0 0 0 0 0 0 0]\n", + " [ 0 0 0 5 4 1 0 0 0 1]\n", + " [ 0 0 1 4 4 2 0 0 0 0]\n", + " [ 0 0 0 2 3 7 0 0 0 0]\n", + " [ 0 0 0 0 0 0 11 0 0 0]\n", + " [ 0 0 0 0 0 0 0 15 4 0]\n", + " [ 0 0 0 2 0 0 0 1 10 0]\n", + " [ 2 0 0 0 0 0 0 0 0 7]]\n" + ] + } + ], + "source": [ + "print(metrics.classification_report(y_test, tree.predict(X_test)))\n", + "print(metrics.confusion_matrix(y_test, tree.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 778, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 778, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree.score(X_train,y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "MultinomialNB" + ] + }, + { + "cell_type": "code", + "execution_count": 779, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nb=MultinomialNB()" + ] + }, + { + "cell_type": "code", + "execution_count": 780, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 780, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 781, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67375886524822692" + ] + }, + "execution_count": 781, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 782, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 0.80 0.75 0.77 16\n", + " haskell 0.91 0.71 0.80 14\n", + " java 0.60 1.00 0.75 25\n", + " javascript 0.00 0.00 0.00 11\n", + " perl 0.29 0.45 0.36 11\n", + " php 0.00 0.00 0.00 12\n", + " python 0.71 0.91 0.80 11\n", + " ruby 0.88 0.79 0.83 19\n", + " scala 0.67 0.92 0.77 13\n", + " scheme 0.86 0.67 0.75 9\n", + "\n", + "avg / total 0.60 0.67 0.62 141\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/keithcalise/GitHub/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/metrics/classification.py:958: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + } + ], + "source": [ + "print(metrics.classification_report(y_test,nb.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}