From 213231a6593698e0b67c750ad9bcafc9da10cf6e Mon Sep 17 00:00:00 2001 From: Balearica Date: Tue, 24 Dec 2024 18:26:50 -0800 Subject: [PATCH] Moved JSON export code from JavaScript to C++ --- package-lock.json | 15 +-- package.json | 2 +- src/worker-script/utils/dump.js | 156 +-------------------------- tests/assets/images/escape_chars.png | Bin 0 -> 10945 bytes tests/recognize.test.js | 25 +++++ 5 files changed, 37 insertions(+), 161 deletions(-) create mode 100644 tests/assets/images/escape_chars.png diff --git a/package-lock.json b/package-lock.json index 54796c233..bc5efec02 100644 --- a/package-lock.json +++ b/package-lock.json @@ -16,7 +16,7 @@ "node-fetch": "^2.6.9", "opencollective-postinstall": "^2.0.3", "regenerator-runtime": "^0.13.3", - "tesseract.js-core": "^5.1.1", + "tesseract.js-core": "^6.0.0-0", "wasm-feature-detect": "^1.2.11", "zlibjs": "^0.3.1" }, @@ -8752,9 +8752,10 @@ } }, "node_modules/tesseract.js-core": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.1.tgz", - "integrity": "sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==" + "version": "6.0.0-0", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-6.0.0-0.tgz", + "integrity": "sha512-aygT8yEywDjH48qiSbM0Qmxnmju6b3Nfyp+gy+l1D4CrPbn69y/8OgoG2Edtz4HtDn6iwnjR/TcRs1+njXCqWQ==", + "license": "Apache-2.0" }, "node_modules/test-exclude": { "version": "6.0.0", @@ -16184,9 +16185,9 @@ } }, "tesseract.js-core": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.1.tgz", - "integrity": "sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==" + "version": "6.0.0-0", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-6.0.0-0.tgz", + "integrity": "sha512-aygT8yEywDjH48qiSbM0Qmxnmju6b3Nfyp+gy+l1D4CrPbn69y/8OgoG2Edtz4HtDn6iwnjR/TcRs1+njXCqWQ==" }, "test-exclude": { "version": "6.0.0", diff --git a/package.json b/package.json index 64d4f58bc..7aa636497 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,7 @@ "node-fetch": "^2.6.9", "opencollective-postinstall": "^2.0.3", "regenerator-runtime": "^0.13.3", - "tesseract.js-core": "^5.1.1", + "tesseract.js-core": "^6.0.0-0", "wasm-feature-detect": "^1.2.11", "zlibjs": "^0.3.1" }, diff --git a/src/worker-script/utils/dump.js b/src/worker-script/utils/dump.js index 4bdeaf586..1eed4e575 100644 --- a/src/worker-script/utils/dump.js +++ b/src/worker-script/utils/dump.js @@ -40,21 +40,6 @@ const deindent = (html) => { * @access public */ module.exports = (TessModule, api, output, options) => { - const ri = api.GetIterator(); - const { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL, - } = TessModule; - const blocks = []; - let block; - let para; - let textline; - let word; - let symbol; - const enumToString = (value, prefix) => ( Object.keys(TessModule) .filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value)) @@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => { return TessModule.FS.readFile('/tesseract-ocr.pdf'); }; - // If output.layoutBlocks is true and options.skipRecognition is true, - // the user wants layout data but text recognition has not been run. - // In this case, fields that require text recognition are skipped. - if (output.blocks || output.layoutBlocks) { - ri.Begin(); - do { - if (ri.IsAtBeginningOf(RIL_BLOCK)) { - const poly = ri.BlockPolygon(); - let polygon = null; - // BlockPolygon() returns null when automatic page segmentation is off - if (TessModule.getPointer(poly) > 0) { - const n = poly.get_n(); - const px = poly.get_x(); - const py = poly.get_y(); - polygon = []; - for (let i = 0; i < n; i += 1) { - polygon.push([px.getValue(i), py.getValue(i)]); - } - /* - * TODO: find out why _ptaDestroy doesn't work - */ - // TessModule._ptaDestroy(TessModule.getPointer(poly)); - } - - block = { - paragraphs: [], - text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null, - confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null, - baseline: ri.getBaseline(RIL_BLOCK), - bbox: ri.getBoundingBox(RIL_BLOCK), - blocktype: enumToString(ri.BlockType(), 'PT'), - polygon, - }; - blocks.push(block); - } - if (ri.IsAtBeginningOf(RIL_PARA)) { - para = { - lines: [], - text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null, - confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null, - baseline: ri.getBaseline(RIL_PARA), - bbox: ri.getBoundingBox(RIL_PARA), - is_ltr: !!ri.ParagraphIsLtr(), - }; - block.paragraphs.push(para); - } - if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { - // getRowAttributes was added in a recent minor version of Tesseract.js-core, - // so we need to check if it exists before calling it. - // This can be removed in the next major version (v6). - let rowAttributes; - if (ri.getRowAttributes) { - rowAttributes = ri.getRowAttributes(); - // Descenders is reported as a negative within Tesseract internally so we need to flip it. - // The positive version is intuitive, and matches what is reported in the hOCR output. - rowAttributes.descenders *= -1; - } - textline = { - words: [], - text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null, - confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null, - baseline: ri.getBaseline(RIL_TEXTLINE), - rowAttributes, - bbox: ri.getBoundingBox(RIL_TEXTLINE), - }; - para.lines.push(textline); - } - if (ri.IsAtBeginningOf(RIL_WORD)) { - const fontInfo = ri.getWordFontAttributes(); - const wordDir = ri.WordDirection(); - word = { - symbols: [], - choices: [], - - text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null, - confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null, - baseline: ri.getBaseline(RIL_WORD), - bbox: ri.getBoundingBox(RIL_WORD), - - is_numeric: !!ri.WordIsNumeric(), - in_dictionary: !!ri.WordIsFromDictionary(), - direction: enumToString(wordDir, 'DIR'), - language: ri.WordRecognitionLanguage(), - - is_bold: fontInfo.is_bold, - is_italic: fontInfo.is_italic, - is_underlined: fontInfo.is_underlined, - is_monospace: fontInfo.is_monospace, - is_serif: fontInfo.is_serif, - is_smallcaps: fontInfo.is_smallcaps, - font_size: fontInfo.pointsize, - font_id: fontInfo.font_id, - font_name: fontInfo.font_name, - }; - const wc = new TessModule.WordChoiceIterator(ri); - do { - word.choices.push({ - text: !options.skipRecognition ? wc.GetUTF8Text() : null, - confidence: !options.skipRecognition ? wc.Confidence() : null, - }); - } while (wc.Next()); - TessModule.destroy(wc); - textline.words.push(word); - } - - // let image = null; - // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) - // var image = pix2array(pix); - // // for some reason it seems that things stop working if you destroy pics - // TessModule._pixDestroy(TessModule.getPointer(pix)); - if (ri.IsAtBeginningOf(RIL_SYMBOL)) { - symbol = { - choices: [], - image: null, - text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null, - confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null, - baseline: ri.getBaseline(RIL_SYMBOL), - bbox: ri.getBoundingBox(RIL_SYMBOL), - is_superscript: !!ri.SymbolIsSuperscript(), - is_subscript: !!ri.SymbolIsSubscript(), - is_dropcap: !!ri.SymbolIsDropcap(), - }; - word.symbols.push(symbol); - const ci = new TessModule.ChoiceIterator(ri); - do { - symbol.choices.push({ - text: !options.skipRecognition ? ci.GetUTF8Text() : null, - confidence: !options.skipRecognition ? ci.Confidence() : null, - }); - } while (ci.Next()); - // TessModule.destroy(i); - } - } while (ri.Next(RIL_SYMBOL)); - TessModule.destroy(ri); - } - return { text: output.text ? api.GetUTF8Text() : null, hocr: output.hocr ? deindent(api.GetHOCRText()) : null, @@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => { imageGrey: output.imageGrey ? getImage(imageType.GREY) : null, imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null, confidence: !options.skipRecognition ? api.MeanTextConf() : null, - blocks: output.blocks && !options.skipRecognition ? blocks : null, - layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null, + blocks: output.blocks && !options.skipRecognition ? JSON.parse(api.GetJSONText()).blocks : null, + layoutBlocks: output.layoutBlocks && options.skipRecognition + ? JSON.parse(api.GetJSONText()).blocks : null, psm: enumToString(api.GetPageSegMode(), 'PSM'), oem: enumToString(api.oem(), 'OEM'), version: api.Version(), diff --git a/tests/assets/images/escape_chars.png b/tests/assets/images/escape_chars.png new file mode 100644 index 0000000000000000000000000000000000000000..35ee09daa62ff109613daa7bbd5611b0debcce8c GIT binary patch literal 10945 zcmc(_S2)~X)cBi(=sm$0(V|8#gJ1}QN%R`M*HNNIZ^7s-qKh_25J7aJCCFg3LG<25 zl;|~_Z+`#x<~+|i@5Q+}7nuE+-PYc#?9W;;n(EIy8iv#?Qt!& zxpRkGPf0;e*Vk+}m+;uMi=l@h%G&j%bM<~e>68)|Sv4DyRrkjzH`pOgZznO5h%x}7 z9YYa7uP8rj;Pi?DLM280b98FPz|m)R*lTm$XLGR4XD`6@#`WS>W^;h$R@^85BFH}Y z=zW&oV@0ipiiFWb358s-R1c}aK$D$|8xmW{rL3uoc&MnAT2PAR0-ADKDBzn~VTG_F zpoyy(0sq%A_W#~dF2Zi&v*C#O?d2o_9ZxqxYaW<D2ieVetP*!BLvL%b*u2;jNp4oelS@2Vl8DPnv%89Hr}& zcBQTVMwb1`VqK1pz)v!~huIysfA>eY%Rf^MiVQ=wk{G26-B;1<9Rus_S9@`K#;@w1 zl1FY{9}Op&e_cb)Xx2120wv&nSRHrPi)$4}UL@D(8K^OTm z)kIy%H2Cy)Z_a3J;NO$U5t*xtS{I;Z&i6`w7JJnK6kt-D#a!$U);)8&x1B^cdc^ISjF)bzIXZhq;LKr^X&Uo=Xu|m zoMS3hlgupmV*On5+GkL7Qq{C%J_1j?XGm(_#4k@|=qaoJ(V%#W`8-xTXs>y*5QiN4 z#p4kS>bWt%7B%_=Srfkdf&zsS-mM!`FUyFMyWOcBe)nkc%VO}=heL~K#4Fm+mHt&k zmi0hj^U;u$wR`d=GlSn+@(8_aYx3H*%+A-}m1SoXm^R;0CC(3(W0f%ZxKucCg-)Jz zvLa1PW&7p!ntq77^vTaA&LN4XlT5egJ&DJ|SBL8v=4jGBbw7O6u7wn(bNmMKf%=O! zXna<}E8oETtN72(<Ajn@V3 z*%~Qw5>AXa?W-{t?=QIo91soYfx`B>srwJ=J^@d^z0TLT4-QhT@prbAjO1ZFzCi81N^`%>UIFGeqA6P-J4igxlH8q;{zcDR$v24VRp zd{uNhW;#hNzK4h3F-$>FH<`YPZ31o)KGJ*LZB(S`Iz2-8{cVY{^tBBK`X{E{` z#fXofSzEYAy}DEAVdL3CK*>;o`%3nF=o6if#YV@#$P^f|h6i#H=q4M`k?&@%&%2JH zh;)*a$zJ&%hbJmxMu_x%V&}5W`P`2-zX3Gi(&HdGFW8Jk~EkQg{#{iig}pn}A(?X&4r! zuyi+#rT;c@%Kee4!!LEgzVX@f&1biCLFopgnKk#}l|LUSkYM<)B!FGterp@~$vd2CdJ3k?%eoVLxA0Jy2P8Nl(0*=poRWWC>>TJMm@K?4Tic4AI5=;*`mMJ+>gBR|%mD*;QihW&sr?TSu%KyQ&_tkEre!3S63v^U= zGOnM?%rknd{xfC}3fXkXA%Ubp8$%L^^$Fo1*aI*H;wi!>Ds*6Tg19TzZcB=Jp zg{wCEwdH1AJA&qxY_d2c2ETuP9SZ6_Gf?3Ar~B04v3g#jahnkpy@nI^RP53-uZj%qkHy4Zm-s=#x0-(;^~{5&P@xYa`B)0+rp)aN3Jg? zJ1W1D@k`Qvyw9sc2~soK;^;`iGH<0awF@OALgk|^Gs0&A$@Aafr7W_UPtq;sF!E2X z%@|&9h<7Lv;`Ct8YnP4R_ja}};9rz7Z3HU*^ESZkCsgD@p~e*^jq~0kQC`hfh^(w@ zP*I^*B?Z}>%tk9Qf0M%$z5b-+B`ul6?WAJ4XLgGiZE)=dPQ zY`^T=XN5G*dmH}{?K_t<>mtyycU#QK?{@4~T!(FNUJ#HDTbYbTyO@Qqw4E))nMNIg zeL>C?`P6y_EH~0sS~w6cHFEVQ@Cf)DmJj3n!)k@bE@!%e{>Lv=^On1!I6Kx>iBblM zsFt!#j5kgw!Lv*0z%_YrrbyGYwR4j70`T(BT~c)oQI>9Wca8mjeJBFLFv$xiF~I)$ z9#^;q@pzG<^|wR+dahll9$t{y%C(wITIFSshSVNWThh-xMp-I>Bs%aZ4?^Lg5|KY! zeD>#ISE7>lBb$nKT5F#7y?rG zVBPUn`eML-Tc5y%rc$xM`#Qi>^q^-y8p6WK+e^>hNXb^3sB#ON*hXwDg_9>9y{a35 zn^lq zgzB@Lh&WyADHXw3d#S*kk+w=*jHhU zi(?};N3j&ViK*l;7l}ZNIdf3P&LK?p7}7c1DYX$Mq?I+Wa2^9j4axOXopg;aS7cIw9#9&yjHD zD1w3>wz}?R^B5Z{IYID#%v2La*_2*@5_O^toE6Myli@4bZJ6cvO7y1AF4HObnCP|3 zjQ9f>dGTaB4yF`6cP9w3#5wPN*~pZ!k9{DF?nTt3q^@51Wd8mO`mH{{-AVRj{5r^h}C6-%O@lT@?C$2MqGzieX> z_L2VNg2v24ZIC0P1{I3>SD zQDQ%+!&+~0pMb+opdYNO@vR?yTT>Y+YKB)Jt^VPIWIt{(KvUV88VHvbLE7|EB=3~M z2Fjm=^Ff-_oM%S7&|$HLKPBPvR`X?c-~WEG>1bu5I#LBkI61$$&rA5h=ikBcX#X4y zPw7di(YyXnlqYjT<_&vk5ft{F)4P7w99kJ!`X6nIU`HXP1GQrqc@ol8&v@}!O z1*fVuv5R`p*NyH!@2%Y@n>X_7!m+(Co-8PtO;uH4zkLhE(eZPTNM|TvdA2C(%8wKP zfoBss8AW$q>If*L303Rny=P1{X&U7y#x@laD*2sR%fC*Cru$@wCO$bzK1AZ};IXJY zPD{YEhNeqAHH7VjJt0%2s1ImVo%7z!DH~9X*tpf)@;}|llH`2i^t=`(l1J*1~9Mj6?uVOpLrrKZ=^A6sm_Y{AYr~k?_30$cqkSe10 zQEc-Em$}#Y%f2cX7!hgHLP%Cx1f5Pw&mQo01J15Du=p`5nFp2P>%&XEM!ZNRF0q~j zH#anh?I8nWJczj$vAYqGdIAC6y+=Z`(gN8LG^Nvd<7{DgCM}!I1@HM*PioT(s_)0F ziLlPoWEa{>0ue@GS22Z^>Y-7(Ei#fqy9HlP+~0WD_NhJEvx3?7X!xAu+CWQ;pI;j_V+G)km!K|xp3{x$E5UPpWTQ}y{BuF ztg9X}8L-BzpAEX_^hfMku~JuX6Wb*SrugPTf%O`7)x}ESp$3gVF#s~1k|v;P>nObH zTB=E;j;u+(324Vp@!Ac5^~S{q(s+2;P=(LR*iL(^CC4VFI3bg#*aWILEyN8Brl;y# z+liZNa8@FFZCR@1xT(M`H<@+){4EdLaggs!Jq=y4G-{Yl@{secHS-I%q{A7OQ_gL+ z8Er&n5dSj|KA3#)>|091PMHVo76rEb7;%z}v}A{McizG*d_0{6>xFAllUE(r>jLG0 zQv-)Xo|HeiA*}HD=p)))9jh8qXrnKOoouSXCt_XG}CatI~q>&u)g=2$H%FX@S#HDz17K|3W5cRV? zz1tx!k`<9|xLd2#9U3e;QQazL{7HC+|uHK^)($n0$R9Z#&TbM2x`Fq$w- zzOzSz_&wvj_Mq8yIV;{!MgkSYR9bwf>a<8#=AXvT?)S#1>{T2q*KyEjF_LytTz>}; zxu3I{Rfu=H7oy3>Kd#27tBImgGa&h8Uj72VR1GF!b&HhYHCFBe>-L-b&vuztRrnX%Cu6AIF%?C?h!3 zve!)F;*l(cvhP$?&}j7ih1n2ul%b8wd>%;7!`$s}NNS zXh@&vVt!b0k`VHRta5k|lA;sREBfoBv~0xH0@MVIdYeG;1gy{j1{8auoV6Kp7;FQg zkI#gP6N;0qLKOG!O~y!{XQ!PHMXP3g8TT*X6n`~S=L;)p#y5%amatKMlh-t-7us9$ zYpjH};kEx&T!dYi>3%scBSRK7ug;47f)vR});W?0)pF?|y*y1Pb(f{?lU95IyP*EgxQu<{8&&U0-e zDnzRbm?~NeT|%TaM+CWaU;H92(X_JR+R~jeDhR{+rwCm)#kkQ@#&8&$!4ZPobp$~H z>L74;#s|?%I)3AAHlvz&R_x`8Lzz`UBpJi@HDjeNL?fJT>UhTOcFA7+%$6iTT6zrr z?X#EvQ^hs`5^B5}Es7@*0jYTSn$WV0NenR^3^ZkTN%K%ymoIP>Z{?|A#YZI3_Vp~kG`hw8! zZ_9LCB2}d=W9inA3X+PaM9lm$t23RZFpa{A^zu-CDfB>2h7f_i0mKC_*xux0{P5zp zAgWJl?BbJ;N3J|C8d|7u)2f9?Qkn%a%0C%e91A+4w_2#I+4(b6EB+*0OvG;<$Z!Ru z%C)o}xcZw9{B=*GH>Y@n$@9BqyD@e06IojMSm`Fy{mGdb|0GP*_opWi9cdW`BDs&KQPj!7uFF2y{SNGcqYgp%I5(@(@r2*(Lo>h>W8?3v(B7{ zuwz>rFr}&tk205X%Rg3aN3t-y;ZKo(j(^S3 zDSZ1woL67Vhyg(04tFW2Hv$Fq!>6~JrYM*=$hnnUivBn3{r{Pl|L+~g!oGxDGx*F= zGI;c|9O!ibTpDqxGAR^%eSSVu^gK!ZW)v!OdwsOE+qjr`bK2PPgFZZ~dvPjiapQMY zM`E7uO3YR`xy*~3cfrO zI-TZRp#G9q2N2jFv(qx?OW`BWnSxSt+K!h0vhV{;%%$$zJ>u@jc0-s*!0FEU%zNh= zrNfNM&Dvp^xPL4N@w=iial+WO?WYvWLF1tKB2@BdKtyMjaX3qab9=tUXE~tASuGp# zWX7YPuZTU}pwMH%n-;t<8+0~L+tCPs+4e&dZisR%!9xJBVs zEELRl#)Xid!m}T+La#s-m zM5MtI9yWDbg>iQ6;{Jl-3%=GS8+vAb>zDwsGSi5SIDz))%8r}M^OlO{jqI1hA7Dk0 zxPnR%6MBLu( zziU(KKknW{a%kR?Q(udPUhUK#c(&bK{vH7^P?kSGHTZ)eE>n@j^`9=Ch+JgIJU;ip zo4Yul5<7Nk`mL--V*U)A9*LWW(z`_Ohq@8&zd*$ zjCE3%Q=jNqeb+_-Niw7G21+p6ap#DFY+%W+WW*6B&n0-~zLxwPZT$lErAIlFwW~?R zFuwd{1i)+@NN3;((8Gq|S~S?H+wyxZyqqvWXb$=5K`*3wl&dWAeV{iLk794uts<5SaJm{9glf?q5e18Y4jv$NJ!Q~r zWm52+s-_j-?JdiM`hLP&)m7`Gh(wLHA?Sew&D7?!-TPOqXhiAG)qY1v;6N--C5WDd zJFQKj2lxk#G=LXB%*n5NKsssV3eo+PMy1}(1u62dsqg&b>CxMByCmsTgEBXJb4TiIiuKq-_wkqdKgtcjEcjM>UmZzXbs3E8{+l zS+_0yrkMI`L=et?^nfv*I9AWwFhhqi`kS^2A9c$x=6DG6ttaXvWNU;~avAvm z%y>sT4vXZ{Z`IEummfZq`bUP94Hf~CeEu~M-*dqT8M(mHreIvRqhLtNvZH)f#E0Af z>{g{m8CIEB_-Wl!+qi@XJOC~@NM{EgD^SEh1&K=X{2g=hw;DfR&lLsj6WC0=D z)>JP6Cm>@c&i7D0iY%c@=kzLV9Y_(x(-lnlcxZx zw+c03U6Yj-9Luq1gdjArTOxXFPXP<1v;R#Uv<6)9Y8IAB+XxQbW~+72LwAyWW@3N3 zPH#`R4#emeGfVBp(H zw)F_iKgU=Wp;(;Lw<_{1-fN3uRv0GFibBK> zn*F5KdYS6)5R3hy$(q(j;(<|cB9rGh!)FmXb?al#=pM0ee_<~6Yc=VadT<$@Pr^X| z_K0CG4WG=>W5KYdfB_*Waesx478@iw_E#UXm??{}tcrhC+i5I~iQ9KoA1zkPfk>>Q z#86tAZ^yB)6I}*34}=9nz4tT)zi|%8c&A*y8%zqhUeyK`R8e~K1wR!Ud07e^)k75O z*ITDie8|sT?aXJ(&qIe->GdC<~Zn7WBj358vS+FDLeVCPN|Ym{-dc z2+=^eje+zZ;e1U1m;8yi`>HZ%#-W5ggpmB6xZmO0l;&u8+CEyHwiq4QWsmn%e{>uYp1K3nS)wvFkOD z04RPClq$55u$HE87PA=P%bA#Sp2{)Ti^{XvJ50wC}fHp-HQ1NaNsyYI&L5 zvE;z(LY6!VYkCD8Q2RqlA}AP*f$Jl)J)qS2LJzO64(Is8-@K2NK32=CqymR=PSK*T z=`#%V&p6nSG-G2nIb1f!hKvM!Bf&siW3+AB*L`LgUlI)`P?KKAd}~|LOTSQq>G}X? zqU}|p1xumbDd3U;eAJyvVeT(M{6-CmDVTu*;Z-8tsedWhg>bXT50~eoWyQLBDJA75 z_B+}b-!~WHBX9*9QQ~fD)jN4hMoQl`A^blav%9^Dde1_MW<4D_-tska- z>uE}Rj=G3cJyZqqU%kr9@bY5bE;pbeLf?d(B65C_R2^nkoWc2B;6^?+GKs4ir(>@ybf)nxrU2!t9&YV{vwBK!Hv z?aTjXOLYsTB0sW%URm$=8rG#oCdn0yiZJsWSvpVeHIXR1Rm*uQbfGV52*Uv>&W30t zk+IH>3yOU|F*IzUWQKr-3K?H8er{r3BrQvZJt#D}fc?Q?89oeTLpz(u(fxNgday2q z);#F|z1$@B9d=zw5zjoNr5)Onu^n$2h;dZbS*tZ(WNnoLB{DxI`yXo{{`84N!EX{< zxSU+HUl?36o~hm5{3XT_4itpRC!zGt$&Q8Bt}L}Cpn(w_HqD0-e;gicxM-)p0m^|Y zV8($;DB+Ti*llH=o5S z^A26ccpi8m56h)9P#}|3mD^aHC;&%9f6W%LS3jIQ6x_$O(zd_Mb0Yq4!j;+Tq=cl| zx58YerG}QL|8oK5@7K-i6$7a?i9R7(?gwd@HA~(i1a@JP8T-kWMJ+1>nam}OAn0~! z*`*PvP0{y@L*ixDAsXkhoOEB4Cm?-7XCv>B7PfEyARz5;5!5fC;o_`e`L<1^cuu6& z>k$!Zf1SKBnAxN<$h_{C7SeR@ON7H3SU;ORq@6^8B?Z*Ew~d=vCgFB0#{tcK&m*TT=f z>V8!yls($(cMOdP2(=s-{c5`JORHz?1Fjfp<3Mx(I+4LxBC=bIfFG95W4+`)c=;{c zZzJdP!mfwur+@NyLs{0KOx^kkNGrs-OWG)DsC_pEvWD2<=k$>p8zpJ|jK8Udf`HHn z?Lsii?6!!949#UCC21vRl7XfzvYGfD8yGPEDT#jNW4?oK9WL!VRoce=U7!fI-A5o*JTg=bz zB^oFnx_fhX9=hn2*bZ{}@kZvnN-3l8qc~JD^W#kejli}9agPNv&!eSQvz;oH*4eY; z5g%%YBze5mD2({NXMWK^IhkqbS<%s<5r$5fQ=iI)94$*j*gIySnlP|!Z74L-hypG=f+l4L2ry&X3w zm{c{8Sd(Mks6ip~h3F1X58^_&U1#z>c z^?>khX3O=feD@NhM<`f|L{(+tkAVkvZun=hIoAHO+R2uOdL(3RAb&f*v!vfYPOd8z z1Tb!Zf=eAuJ*mTH%6WjvPZaW5Ag!MC@h749j39B@DadfEc}f#M4B|W`ShcSxV7G-u z4f(7qpNd3q=@SeBF9Dj9-C1#$N0|tjY3q;9T4*jOLt}&08BghOo~#Q+f|lPt`j3lE zwJ=nH@gJ#~E#Ab&F#j>1Q>)y)F9XyM8vN|lpV(FCr$T*r>;WY_GhbNP&%IY+ZPbUW z^s}}YiYHl4dlzIMEWSUmJX; x{;PxBK>h!Kfc}?S%m1An{XcYk%XP_A<*vVsEAm_yC_BBQq^Pb?39|_MzW_=VB+~!@ literal 0 HcmV?d00001 diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 2474a305e..0b7def911 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -269,4 +269,29 @@ describe('recognize()', () => { }).timeout(TIMEOUT) )); }); + + describe('should support blocks (json) output', () => { + it('recongize large image', async () => { + await worker.reinitialize('eng'); + const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true }); + expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('T'); + expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('This'); + expect(blocks[0].paragraphs[0].lines[0].text).to.be('This is a lot of 12 point text to test the\n'); + }).timeout(TIMEOUT); + + it('recongize image with special characters', async () => { + await worker.reinitialize('eng'); + const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/escape_chars.png`, {}, { blocks: true }); + expect(blocks[0].paragraphs[0].lines[0].text).to.be('"Double Quotes"\n'); + expect(blocks[0].paragraphs[0].lines[1].text).to.be('Back \\ Slash\n'); + }).timeout(TIMEOUT); + + it('recongize chinese image', async () => { + await worker.reinitialize('chi_tra'); + const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/chinese.png`, {}, { blocks: true }); + expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('繁'); + expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('繁體'); + expect(blocks[0].paragraphs[0].lines[0].text).to.be('繁體 中 文 測試\n'); + }).timeout(TIMEOUT); + }); });