From 7e976a5f0722fe0ac08dd8d8755c9961e4a39698 Mon Sep 17 00:00:00 2001 From: Hannah Date: Sun, 7 Jun 2015 22:09:34 -0400 Subject: [PATCH 1/2] python files --- language_classifier/classifier | Bin 0 -> 13435 bytes language_classifier/gather_data.py | 46 ++++++++++ language_classifier/lang_classifier.py | 113 +++++++++++++++++++++++++ language_classifier/predict.py | 17 ++++ 4 files changed, 176 insertions(+) create mode 100644 language_classifier/classifier create mode 100644 language_classifier/gather_data.py create mode 100644 language_classifier/lang_classifier.py create mode 100644 language_classifier/predict.py diff --git a/language_classifier/classifier b/language_classifier/classifier new file mode 100644 index 0000000000000000000000000000000000000000..425a6535e48107682cea26f638a589e9b0740b1f GIT binary patch literal 13435 zcmbt*30zHE^#5(5LE~-iPW42lL_}2kYEVQfQ$ll8nr@S#bSWVuQF_LZRAeYmin=R$ z8G4yQNs|l}BFU6j{^z=9AMfS)z4!n9_Q&aa*IM7T_TFpnv-i1YyL<`n@Ku4no?*d; zq5h%1f&Rh1vX1xzk2Y#OPb`inZr~zEqtSvrgM5A5!y|k{!+8>MJV^r=X+pcg*E1qA z%r~4TfQ?sXSbnyFSd!t zbmp-P>?K^p2(1~9?cm@r`zukt{jsM-@Hk#utfC1UjmDqN{f4v|BII*Pc@l~CBx3zc zQpS91k<2Icd@GU6btd)L&z$r#=JIqSczRwgGBl#a{t@n>Vg4auyk9oEC<_&$so@?N zvXU@_c)9b2`>FbU%Ywcik%AG?Ps*6nsGqZ>A34Wt?yVgU+cm2 zayDSRpyM@8 zEv?VryA6Yn8UE?%dzC=V&Tn;af-?9qt}8as6+w3RDmN#tB+!P5r@@O>LH4V+8Lh=% z1qJaNyJ;I4RNUPff9Bu5A`i>f9C(o|P=VJ&zt~weD?!4Nol)zgmEiKT6$_3B>Hxb` zMV1lEfmif>`OXc}5Rg$KUF)w7^(zVvcDQ7s9 z^A;OcjE-&=BlI>#c1zqsI8c38UQ%i*2g+WIeeL7JftzzyT=jp(24wrD<~V_Oj%~SV z8qJ~b@e0T0dQT3NpX{Feb|rynhSL4s9C%))yiGcg0~&f2b9&ZtApDFRt8O(1wl^4V zu<_zRVIylgtzrn(pQ+QgMcp(}0mDd(Q#yP)Z0;;fXVTjR-J1{X@mWrX3q3a#l^bfpyLE{J?NF%;Z0hdAEfPHIhr#3*^7!CF_}^99vm{G4A9UrM>Ie@Opk(wUHMa zE^8Uj>)OMHSF2+ZAE&aZ_N_c&Z2Bu9hu36%&M;;}@937--47L@K9^gsxK0_8toQ7n zHlPgPS-DQ*7ez?(ij%0clYk3V%N{ydcM2LU%B!9Re-${Ks(Pe(OPz|lV0ZF>bu}F@ z>K!i^?EXUmCdWN2t9v(u(qsGo71igSI2bFX49ni8+vKd2{Lk^P8x`-dy+McIcgno` z`E+P{IP+rFOBFCN&`(jSX-Cu z9`Rm}!I4G7X|-DlK~mZJO=yjWuX=VgD$ zZ&sP+cU!y zp%aKLL!|_6?$>;yDX$A3E;gO^?B&2*%{$ovBXps875A;8@hv zqMiIiaA~FyyYq3MAWXaUZO3Ld6*vBQYKHEgT2MAF^h0L_gTfc{Bwr>Pt3d4hO&23a zeH9G0pW*CkZ?YUh@v*({RpsJ0%QPu{=#7ZzhfeC0jCnsp92fiHcssXz;skYTgUQvH z%+G}k;CYyGj`wMU-lkPEcTCcO_>UWx_^)HY$&s9j#sdskvwY2^?)y4$c}`+V(XR{$ zdZ^{1rSl!#rQ>fc@6-W|ehP(ut{>2W&VahP#p}Q4L8esyX28Yoa?+0WIa|^h&~uu# zYF_{YZn4S}Pk;ieEi zoT5(IF)ylRcgoAVL#Q~6oUDsy-)RHJ!EteYTE%|jmhNRiM&taWvUY5EbIb0{kRleu z*_2B(X0srEUT$C5JT`3e%b60nj|EwWb(-2fus}zPJKV{J4X$T7PcISAPZ(|dEPG6x z*?VyfT`24lthBE47#n3nnVG1UnG>%*GWd zokacDPG6t=KwlD?-71!Lj?<+2?{!;zzsnI7FuF7{J+%9Q;AREw%{obyAM%0oVqJ1^ z!-iROkbKLzrg&W*60)Nb1bJOo-CnaDA9P6P)s&)ZZWd9jga2 z`l1t=fH9$?23Z(0p?&dc&Dty-C^kNlBeR+b@zoEUlYE#^QdT(f<0d9Z$joo2Z)C#! z-%hVJY-d34@d<_5cV*yQ-udVg)6^jH*s5P`#FSxT*|Px40eOg)-@3eRk2vs6Ojhz& z_6oAM?{)P~RR+~uhC`5>GL@IMu9NLa&(z>8@W+F=3jytMSqfZA!*^jIa7va4yaLD8TZ+;cr$cy#}+i0N4L-{L+J66Dpi_;J3hr zkWX8P{3vdHt;=G-^kSp+4`dh+w(y{-nIDnor_=Sf$}mBP>$-Uxkteuy?xUD06Dnt; zrBas|6uLbFvxZ~_U^E+hV`%#rCJfh~a&Nme1H7BV-4_7^VphBF8RN->){2aoCUV-~ zUj2z?b&$x%gF7w~j!bYgVx=xMkb@S6o6=$%d0;ypm=Nr(2-7s%K8)Ql49=v*FWgY5 z3`(OT+CGSN3d&zjcDfu)qw5`Cgf{?g?92pMF}m+e$A{wSzkd1Zklht z)yY(aT}Sty_2^Kc?6^L~H&xv!?ce=yTq4uf!Y%?85)lh3~XRx8JS-kKG zv2M}cpIH@9E(ax<#_zREm0;nrGN(*EC0L~C7?7tb2ZvGV@S(32L203$z3sIpg3XsE z2VXb*D!}={{X6QKU32)B0%)_h4G;6yfgKTNXZ|`)1r|5AZd^T}PU*27<6wJZ-O0+l z1DZeOV?WHpi1io;$Mu-?T2f1e4mo>6XSyucf#}{1^6TAopkn+V$6&J-c*Xp=>u9SM zczph_++v;%xaft4ovP6SCy6luOlcipct7$BoTCG}w)L|U%CspoH`J7nU7`aRq4!wi zzD6x*zHQpD`WNDPKtIpZ@18c?mJ2SA8l^+^Yu?Z|i?0&xh>Mza7czwY{coQs<2Fb#k!A$Vi5g` zsW5hSpTN*Z%A`dw49ezSj`IAd0WKde@7sC1M-Y2eZ=n8*IH(;6xjea#0rQ_a=nV+i z6yEgu^i=wTIv5mBuC6pxqVyTrPBX3rs8aG^KP(SfKR>%`o))AYJ(?h~REOf@xR{6W z-mtHIuEhLQ&V_|`iEKEa`(%>LQwG?q$~($_qXQj2{RzyaY~uU&rW(1sESRuAS8`MV z3+AS(>o2fk!}^W=eFJ7JKnZ76&zckSy&z$@U*dNRIsV)3qbFGqV{TUwOYAFsXMfj?qM>>u_lh<$W<+Fq6KC^>3Zf^roTFqR}4mnUpx z!XegTwMUSwMbLdgQ$PA&2`@aCfxHdQoWNV z3!m1VJ@>p?k-{-j!!07aKc|N+;i*Nv-|p20p?>pmgksm9%=Pd z8?5X;YgsH+gsbRMhmsAkZj|2SZCObN%p36_Q~vW1NlM0gj5sd#!*L(e3{AX0GGNN8 z_0uP3GGUmK!7ZmHOrTj_$`?0eLPAb}{E;I}DC2$I#F1b^H|P3Dx!Fw6s`Ngj8_R^_ zFMLKlWPV3nA4aWH)B1~6GU3ZyOF41kIpEruL}%-B47hLb_4fTdCd6>+tCL}v*A(jV>OC*#Ak8M zmNXf_{c{@d$!fQx14erBMlL%}iITDYzoN8Bt@Wv6GXHD;!S&$ws6Gwf-DN_Dkb#<{ zCJQ>$?sv&sdqV}f2Nqh6$xwk)M_l%9)1yP>rzbpfRl;6APWz-19nKqFwbvkUopfpC ztY8%iW9K^S%M$N_FrJB_r8GZPfl=o!JoKDOhp4>Xv3h^1K*0T9JYLi3P%}b2F!`1; zGF41sN9`q)jOx9xd3otr`W~x=w#DB- zIP)eF#{_=D0|I*kdr5og2%fRm7E9r2f;U0^FGa`;B(4zR3LvgX;tC@!U*h^}5%R)` z%a6D`iA%@{C9c5*+lAa!#3hU)j4!~q`c{N7gl!kr^D~Z+DC7zI{6E!+c`?KXVOxcK zVLO6}Yj6?PCmaJW;u7+IR&XAK{SdZKi2qbDPgtk0Kf*o_9$TTGuw8gA2<4v@-lPcL zWYNi*XJDwGC(pzdb&JT~&iO>AW}-u(DGn|Qgb5!Ckw6LWqt zTMu{GJ<(v7ZYSGT5yf|&6cnbSQ7(Gkx$w$vmsJTDdnT=WlUmd9` z%tb~HtE;aq$U<(3@AhnHnS>T@+9dN)e?2mDpS^4*KVLKs{8fMN4?bh1k@Kj2KWR57 zkt4#6L{g@aWpEx;N&5&9wyq#$l1>3RpZRzm;Q37=cLC{VAVR?R2Ngi{fs5w>*5f#& zA5EJ3?0VaQT9xN$cWSiCZmAT67U?0fThoO2Z$}IrXMP5azGOdRp7JsD zVH7VB~D&YD&5g^v{Ku4S*tFD3PJME#@A$Z@{O`OFCx*%`Y;n7Ueo zG;%y>KP@D0@Otr6{{N=|=Xp3;&%e8ycHUR2Ix`R5$cxdNz9tK)7~eKVp!=~)puqky3AGfCkqIkr>Zn63AEz+6t~z9xIb{9&*ZIhPuF0Iq)uqVlVatr5 ztIN^4O+m$@GqR}s!^Esg$A+!Q>GN@%W|K(X=iE?{ydg(~ttq4( z&qo?r2FqAajw^p9>4)_sQtQqj{SSG?>;>aflm$Scu6enRRQD(~;2({-;Zuc2fw4@QGYK@NKIwRdjC zwezGN=iP!U>d;gY@qSJ2a}Y1md*FNyT9^3qNIyLh0)8&S>pR{*j7c6IH@tq+$b#=P zoWDW!U>e7ZEBc7zQ1=6=9~`$E2FwQZ$ zyfrTQ2C7dgTOt2A4}HnLm3zsh45i$fN;g__93|X{x4yT^6>*=|i&>{9BGtLSv>3Zo ziN?XhpKLeYS8@NRi1bRIX1@+$OzL&|{LA=abSAWVAI1QKtb$Sfttftts79-4^S&9@X` z7WsU_i6#4k=QG|vXk@|j9q$MDdjlR<%=;m~*dRh~H23d{f0al+uK(x!Uio;2+UZVY zy5#iZIetgb-FLq%(#vQ@8$N3&_?x$)3sPojboD~SZuigp;#7ss@yD-y>r;)w`uDyl zyHSbC{QInpRqjye(A>rzrg$G=+%S7&<%DGwNKy+Vg0&x^Keo$^e_nbUt$)?Aw&6<= z;@HO86rcZy&UH4mSkO{Y5lt^+@39M_`+<7S$IsbZvR`;zBJ*CqNuF?z6N1Db7Whr`KNPeip^`dYRlG@cK=Df5QJ)6Bdg4#V5z3F@uKWM`G9#Cf1q*yu3>P4pIWbCt%*kC%*(&b%nU(ZYepC| zTiQ@o%?cL@k0#Wo#8%T?xf`u|c)$51s|8gb@p7zsT#OEm?vIbP+k>2BrcJa%3uC{>8H;vYlRV_8^VicP2 zVTMM5%^C83a38k$gVPTKo~p%rMqo88y*otbEe%+l*_GqTY! z&3j*_-y-)1E}r+~_gOyq9s5YZb> zt?+slO7=X|k+AFza!cJE-|=WPTC{}Kd1v)PYJIwW($D!->q%1f;r=}y-UrEb;mlUj zj@KoU4UVK9=k4!NXw-c&SpQ#TJdXdB|8Kl*r1`CEM%r%d-*y^>BAe~Qc#%B8U0Aw$xDA-}+?NtEY7c zm~@wjkBJ;3`cMm&!O_YdC(c)rnSJo5;ig_nyG!4$nn7yXce_=SZ#58?ObT;Y53 z8O}UQ@?^-0cx!GIZ9gTx_=%we&{lLDP`oq*^ePAvBKEP4Atxm%e(qJ3h8!~~aL(_~rLB_$&I+w!m7`KHT6KBJ_(L=*UR`AE z{8}+-xB>gM9(cqTdS5T+XvK@RRZzn#8m7Q}k0#;?44n^fLT^ DARV9b literal 0 HcmV?d00001 diff --git a/language_classifier/gather_data.py b/language_classifier/gather_data.py new file mode 100644 index 0000000..4600492 --- /dev/null +++ b/language_classifier/gather_data.py @@ -0,0 +1,46 @@ +import pandas as pd +import glob +from bs4 import BeautifulSoup +import os + +extension_dict = {'gcc': 'c', 'perl': 'perl', 'clojure': 'clojure', 'hs': 'haskell', 'java': 'java', + 'javascript': 'javascript', 'jruby': 'ruby', 'yarv': 'ruby', 'ocaml': 'ocaml', + 'sbcl': 'lisp', 'scala': 'scala', 'csharp': 'csharp', 'hack': 'php', 'php': 'php', + 'python3': 'python', 'racket': 'scheme', 'tcl': 'tcl'} + +def get_test_data(): + content = [] + for file in sorted(os.listdir("../data/test/"), key=int): + with open("../data/test/" + file) as fh: + content.append([fh.read()]) + test_data = pd.DataFrame(content) + return test_data + +def get_code_from_html(lang): + htmlfiles = glob.glob("../data/html/*.html") + texts = [] + tags = [] + for file in htmlfiles: + soup = BeautifulSoup(open(file)) + html_tag = soup.find_all('pre', {'class' : '{} highlighted_source'.format(lang)}) + html_text = [part.get_text() for part in html_tag] + for tag in html_tag: + tags.append(lang) + texts.extend(html_text) + return texts, tags + +def get_benchmark_code(directory): + files = glob.glob("../data/corpus/{}/*.{}".format(directory, directory)) + texts = [] + tags = [] + for file in files: + with open(file) as fh: + tags.append(extension_dict[directory]) + texts.append(fh.read()) + return texts, tags + +def get_snippet(filename): + content = [] + with open(filename) as fh: + content.append([fh.read()]) + return content \ No newline at end of file diff --git a/language_classifier/lang_classifier.py b/language_classifier/lang_classifier.py new file mode 100644 index 0000000..93e35a0 --- /dev/null +++ b/language_classifier/lang_classifier.py @@ -0,0 +1,113 @@ +import re +import itertools +import random +import pickle +import pandas as pd + +from sklearn.pipeline import Pipeline +from sklearn.cross_validation import train_test_split +from sklearn.naive_bayes import MultinomialNB +from sklearn.base import TransformerMixin + +import gather_data as gd + +def count_characters(text): + return len(text) + +def count_words(text): + words = [r'\barray\b', r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b', r'\bdefn\b', r'\bfunction\b', + r'\bclass\b', r'\brequire\b', r'\bval\b', r'\bpublic\b', r'\blet\b', r'\bwhere\b', r'\busing\b', + r'\bextend\b', r'\bfunction\b', r'\bval\b', r'\btry\b'] + results = [] + for word in words: + results.append(len(re.findall(word, text))) + return results + +def char_runs(text): + chars = [r'[)]+',r'[}]+', r'[\]]+', r'[=]+'] + results = [] + for char in chars: + found = sorted(re.findall(char, text), key=len) + if found: + results.append(len(found[-1])) + else: + results.append(0) + return results + +def percent_characters(text): + chars = ';!=.<>/\[]{}:_#%$&*' + results = [] + for char in chars: + total = max(1, len(text)) + found = text.count(char) + if found: + results.append(found / total) + else: + results.append(0) + return results + +def endings(text): + ends = [r'[)]$', r';$', r'}$', r']$', r'\):$'] + results = [] + for end in ends: + results.append(len(re.findall(end, text, re.MULTILINE))) + return results + + +class FunctionFeaturizer(TransformerMixin): + def __init__(self, *featurizers): + self.featurizers = featurizers + + def fit(self, X, y=None): + return self + + def transform(self, X): + fvs = [] + for datum in X: + vec = list(itertools.chain.from_iterable([function(datum) for function in self.featurizers])) + fvs.append(vec) + return fvs + +class PipelineDebugger(TransformerMixin): + def __init__(self, name): + self.name = name + + def fit(self, X, y=None): + return self + + def transform(self, X): + print(self.name) + print("=" * 40) + x = X[random.randrange(0, len(X))] + print("len:", len(x)) + print(x) + return X + +if __name__ == '__main__': + texts = [] + tags = [] + + languages = ['c', 'perl', 'clojure', 'haskell', 'java', 'javascript', 'ruby', 'ocaml', 'lisp', 'scala', 'csharp', 'php', 'python', 'scheme', 'tcl'] + for language in languages: + texts.extend(gd.get_code_from_html(language)[0]) + tags.extend(gd.get_code_from_html(language)[1]) + + folders = ['clojure', 'csharp', 'gcc', 'hack', 'hs', 'java', 'javascript', 'jruby', 'ocaml', 'perl', 'php', 'python3', 'racket', 'sbcl', 'scala', 'yarv'] + for folder in folders: + tags.extend(gd.get_benchmark_code(folder)[1]) + texts.extend(gd.get_benchmark_code(folder)[0]) + + df_texts = pd.DataFrame(texts) + print(df_texts.head()) + df_tags = pd.DataFrame(tags) + merged = pd.merge(df_texts, df_tags, left_index=True, right_index=True) + merged.columns = ['Snippet', 'Language'] + + train_X, test_X, train_y, test_y = train_test_split(merged['Snippet'], merged['Language'], test_size=0.33) + + classifier = Pipeline([('features', FunctionFeaturizer(count_words, percent_characters, char_runs, endings)), + ('bayes', MultinomialNB())]) + classifier.fit(train_X, train_y) + + with open("./classifier", "wb") as file: + pickle.dump(classifier, file) \ No newline at end of file diff --git a/language_classifier/predict.py b/language_classifier/predict.py new file mode 100644 index 0000000..682dd5f --- /dev/null +++ b/language_classifier/predict.py @@ -0,0 +1,17 @@ +import gather_data as gd +import pandas as pd +import sys +import pickle + +def predict(classifier, data): + prediction = classifier.predict(data) + print(prediction) + + +if __name__ == '__main__': + content = gd.get_snippet(sys.argv[1]) + df = pd.DataFrame(content) + with open("./classifier", "rb") as file: + predictor = pickle.load(file) + predict(predictor, df) + From 516d492c89c58e834496846d9488448c766e5dce Mon Sep 17 00:00:00 2001 From: Hannah Date: Sun, 7 Jun 2015 22:09:58 -0400 Subject: [PATCH 2/2] tests --- .../tests/test_lang_classifier.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 language_classifier/tests/test_lang_classifier.py diff --git a/language_classifier/tests/test_lang_classifier.py b/language_classifier/tests/test_lang_classifier.py new file mode 100644 index 0000000..4306da8 --- /dev/null +++ b/language_classifier/tests/test_lang_classifier.py @@ -0,0 +1,21 @@ +from language_classifier.lang_classifier import * + +test_data = [] +data_lang = 'python' + +with open("language_classifier/tests/feature_test.txt") as file: + test_file = file.read() + +def test_total_characters(): + assert count_characters(test_file) == 32 + +def test_percent_char(): + assert percent_character(test_file, '.') == 6/32 + assert percent_character(test_file, ';') == 7/32 + assert percent_character(test_file, '\t') == 4/32 + +def test_count_vars(): + assert count_vars(test_file) == 2 + +def test_percent_word_chars(): + assert count_word_chars(test_file) == 6/32 \ No newline at end of file