naptha · Balearica · Dec 25, 2024 · Dec 25, 2024
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -68,7 +68,7 @@
     "node-fetch": "^2.6.9",
     "opencollective-postinstall": "^2.0.3",
     "regenerator-runtime": "^0.13.3",
-    "tesseract.js-core": "^5.1.1",
+    "tesseract.js-core": "^6.0.0-0",
     "wasm-feature-detect": "^1.2.11",
     "zlibjs": "^0.3.1"
   },

diff --git a/src/worker-script/utils/dump.js b/src/worker-script/utils/dump.js
@@ -40,21 +40,6 @@ const deindent = (html) => {
  * @access public
  */
 module.exports = (TessModule, api, output, options) => {
-  const ri = api.GetIterator();
-  const {
-    RIL_BLOCK,
-    RIL_PARA,
-    RIL_TEXTLINE,
-    RIL_WORD,
-    RIL_SYMBOL,
-  } = TessModule;
-  const blocks = [];
-  let block;
-  let para;
-  let textline;
-  let word;
-  let symbol;
-
   const enumToString = (value, prefix) => (
     Object.keys(TessModule)
       .filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
@@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
     return TessModule.FS.readFile('/tesseract-ocr.pdf');
   };
 
-  // If output.layoutBlocks is true and options.skipRecognition is true,
-  // the user wants layout data but text recognition has not been run.
-  // In this case, fields that require text recognition are skipped.
-  if (output.blocks || output.layoutBlocks) {
-    ri.Begin();
-    do {
-      if (ri.IsAtBeginningOf(RIL_BLOCK)) {
-        const poly = ri.BlockPolygon();
-        let polygon = null;
-        // BlockPolygon() returns null when automatic page segmentation is off
-        if (TessModule.getPointer(poly) > 0) {
-          const n = poly.get_n();
-          const px = poly.get_x();
-          const py = poly.get_y();
-          polygon = [];
-          for (let i = 0; i < n; i += 1) {
-            polygon.push([px.getValue(i), py.getValue(i)]);
-          }
-          /*
-           * TODO: find out why _ptaDestroy doesn't work
-           */
-          // TessModule._ptaDestroy(TessModule.getPointer(poly));
-        }
-
-        block = {
-          paragraphs: [],
-          text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null,
-          confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null,
-          baseline: ri.getBaseline(RIL_BLOCK),
-          bbox: ri.getBoundingBox(RIL_BLOCK),
-          blocktype: enumToString(ri.BlockType(), 'PT'),
-          polygon,
-        };
-        blocks.push(block);
-      }
-      if (ri.IsAtBeginningOf(RIL_PARA)) {
-        para = {
-          lines: [],
-          text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null,
-          confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null,
-          baseline: ri.getBaseline(RIL_PARA),
-          bbox: ri.getBoundingBox(RIL_PARA),
-          is_ltr: !!ri.ParagraphIsLtr(),
-        };
-        block.paragraphs.push(para);
-      }
-      if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
-        // getRowAttributes was added in a recent minor version of Tesseract.js-core,
-        // so we need to check if it exists before calling it.
-        // This can be removed in the next major version (v6).
-        let rowAttributes;
-        if (ri.getRowAttributes) {
-          rowAttributes = ri.getRowAttributes();
-          // Descenders is reported as a negative within Tesseract internally so we need to flip it.
-          // The positive version is intuitive, and matches what is reported in the hOCR output.
-          rowAttributes.descenders *= -1;
-        }
-        textline = {
-          words: [],
-          text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null,
-          confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null,
-          baseline: ri.getBaseline(RIL_TEXTLINE),
-          rowAttributes,
-          bbox: ri.getBoundingBox(RIL_TEXTLINE),
-        };
-        para.lines.push(textline);
-      }
-      if (ri.IsAtBeginningOf(RIL_WORD)) {
-        const fontInfo = ri.getWordFontAttributes();
-        const wordDir = ri.WordDirection();
-        word = {
-          symbols: [],
-          choices: [],
-
-          text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null,
-          confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null,
-          baseline: ri.getBaseline(RIL_WORD),
-          bbox: ri.getBoundingBox(RIL_WORD),
-
-          is_numeric: !!ri.WordIsNumeric(),
-          in_dictionary: !!ri.WordIsFromDictionary(),
-          direction: enumToString(wordDir, 'DIR'),
-          language: ri.WordRecognitionLanguage(),
-
-          is_bold: fontInfo.is_bold,
-          is_italic: fontInfo.is_italic,
-          is_underlined: fontInfo.is_underlined,
-          is_monospace: fontInfo.is_monospace,
-          is_serif: fontInfo.is_serif,
-          is_smallcaps: fontInfo.is_smallcaps,
-          font_size: fontInfo.pointsize,
-          font_id: fontInfo.font_id,
-          font_name: fontInfo.font_name,
-        };
-        const wc = new TessModule.WordChoiceIterator(ri);
-        do {
-          word.choices.push({
-            text: !options.skipRecognition ? wc.GetUTF8Text() : null,
-            confidence: !options.skipRecognition ? wc.Confidence() : null,
-          });
-        } while (wc.Next());
-        TessModule.destroy(wc);
-        textline.words.push(word);
-      }
-
-      // let image = null;
-      // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
-      // var image = pix2array(pix);
-      // // for some reason it seems that things stop working if you destroy pics
-      // TessModule._pixDestroy(TessModule.getPointer(pix));
-      if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
-        symbol = {
-          choices: [],
-          image: null,
-          text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null,
-          confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null,
-          baseline: ri.getBaseline(RIL_SYMBOL),
-          bbox: ri.getBoundingBox(RIL_SYMBOL),
-          is_superscript: !!ri.SymbolIsSuperscript(),
-          is_subscript: !!ri.SymbolIsSubscript(),
-          is_dropcap: !!ri.SymbolIsDropcap(),
-        };
-        word.symbols.push(symbol);
-        const ci = new TessModule.ChoiceIterator(ri);
-        do {
-          symbol.choices.push({
-            text: !options.skipRecognition ? ci.GetUTF8Text() : null,
-            confidence: !options.skipRecognition ? ci.Confidence() : null,
-          });
-        } while (ci.Next());
-        // TessModule.destroy(i);
-      }
-    } while (ri.Next(RIL_SYMBOL));
-    TessModule.destroy(ri);
-  }
-
   return {
     text: output.text ? api.GetUTF8Text() : null,
     hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
@@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
     imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
     imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
     confidence: !options.skipRecognition ? api.MeanTextConf() : null,
-    blocks: output.blocks && !options.skipRecognition ? blocks : null,
-    layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null,
+    blocks: output.blocks && !options.skipRecognition ? JSON.parse(api.GetJSONText()).blocks : null,
+    layoutBlocks: output.layoutBlocks && options.skipRecognition
+      ? JSON.parse(api.GetJSONText()).blocks : null,
     psm: enumToString(api.GetPageSegMode(), 'PSM'),
     oem: enumToString(api.oem(), 'OEM'),
     version: api.Version(),

diff --git a/tests/assets/images/escape_chars.png b/tests/assets/images/escape_chars.png
diff --git a/tests/recognize.test.js b/tests/recognize.test.js
@@ -269,4 +269,29 @@ describe('recognize()', () => {
       }).timeout(TIMEOUT)
     ));
   });
+
+  describe('should support blocks (json) output', () => {
+    it('recognize large image', async () => {
+      await worker.reinitialize('eng');
+      const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true });
+      expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('T');
+      expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('This');
+      expect(blocks[0].paragraphs[0].lines[0].text).to.be('This is a lot of 12 point text to test the\n');
+    }).timeout(TIMEOUT);
+
+    it('recognize image with special characters', async () => {
+      await worker.reinitialize('eng');
+      const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/escape_chars.png`, {}, { blocks: true });
+      expect(blocks[0].paragraphs[0].lines[0].text).to.be('"Double Quotes"\n');
+      expect(blocks[0].paragraphs[0].lines[1].text).to.be('Back \\ Slash\n');
+    }).timeout(TIMEOUT);
+
+    it('recognize chinese image', async () => {
+      await worker.reinitialize('chi_tra');
+      const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/chinese.png`, {}, { blocks: true });
+      expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('繁');
+      expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('繁體');
+      expect(blocks[0].paragraphs[0].lines[0].text).to.be('繁體 中 文 測試\n');
+    }).timeout(TIMEOUT);
+  });
 });