Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moved JSON export code from JavaScript to C++ per #977 #984

Merged
merged 1 commit into from
Dec 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
"node-fetch": "^2.6.9",
"opencollective-postinstall": "^2.0.3",
"regenerator-runtime": "^0.13.3",
"tesseract.js-core": "^5.1.1",
"tesseract.js-core": "^6.0.0-0",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
},
Expand Down
156 changes: 3 additions & 153 deletions src/worker-script/utils/dump.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,6 @@ const deindent = (html) => {
* @access public
*/
module.exports = (TessModule, api, output, options) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL,
} = TessModule;
const blocks = [];
let block;
let para;
let textline;
let word;
let symbol;

const enumToString = (value, prefix) => (
Object.keys(TessModule)
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
Expand All @@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
return TessModule.FS.readFile('/tesseract-ocr.pdf');
};

// If output.layoutBlocks is true and options.skipRecognition is true,
// the user wants layout data but text recognition has not been run.
// In this case, fields that require text recognition are skipped.
if (output.blocks || output.layoutBlocks) {
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
}

block = {
paragraphs: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null,
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null,
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
// getRowAttributes was added in a recent minor version of Tesseract.js-core,
// so we need to check if it exists before calling it.
// This can be removed in the next major version (v6).
let rowAttributes;
if (ri.getRowAttributes) {
rowAttributes = ri.getRowAttributes();
// Descenders is reported as a negative within Tesseract internally so we need to flip it.
// The positive version is intuitive, and matches what is reported in the hOCR output.
rowAttributes.descenders *= -1;
}
textline = {
words: [],
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null,
baseline: ri.getBaseline(RIL_TEXTLINE),
rowAttributes,
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],

text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null,
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),

is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),

is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: !options.skipRecognition ? wc.GetUTF8Text() : null,
confidence: !options.skipRecognition ? wc.Confidence() : null,
});
} while (wc.Next());
TessModule.destroy(wc);
textline.words.push(word);
}

// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null,
confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null,
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: !options.skipRecognition ? ci.GetUTF8Text() : null,
confidence: !options.skipRecognition ? ci.Confidence() : null,
});
} while (ci.Next());
// TessModule.destroy(i);
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
}

return {
text: output.text ? api.GetUTF8Text() : null,
hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
Expand All @@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
confidence: !options.skipRecognition ? api.MeanTextConf() : null,
blocks: output.blocks && !options.skipRecognition ? blocks : null,
layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null,
blocks: output.blocks && !options.skipRecognition ? JSON.parse(api.GetJSONText()).blocks : null,
layoutBlocks: output.layoutBlocks && options.skipRecognition
? JSON.parse(api.GetJSONText()).blocks : null,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
Expand Down
Binary file added tests/assets/images/escape_chars.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions tests/recognize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -269,4 +269,29 @@ describe('recognize()', () => {
}).timeout(TIMEOUT)
));
});

// Exercises the `blocks` output of worker.recognize() and checks the text
// reported at symbol, word, and line level of the returned hierarchy
// (blocks -> paragraphs -> lines -> words -> symbols).
describe('should support blocks (json) output', () => {
  it('recognize large image', async () => {
    await worker.reinitialize('eng');
    const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true });
    const firstLine = blocks[0].paragraphs[0].lines[0];
    const firstWord = firstLine.words[0];
    expect(firstWord.symbols[0].text).to.be('T');
    expect(firstWord.text).to.be('This');
    expect(firstLine.text).to.be('This is a lot of 12 point text to test the\n');
  }).timeout(TIMEOUT);

  // Double quotes and backslashes must be escaped inside JSON strings, so this
  // image checks that such characters come back intact in the line text.
  it('recognize image with special characters', async () => {
    await worker.reinitialize('eng');
    const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/escape_chars.png`, {}, { blocks: true });
    const { lines } = blocks[0].paragraphs[0];
    expect(lines[0].text).to.be('"Double Quotes"\n');
    expect(lines[1].text).to.be('Back \\ Slash\n');
  }).timeout(TIMEOUT);

  // Non-ASCII (multi-byte) text, recognized with the Traditional Chinese model.
  it('recognize chinese image', async () => {
    await worker.reinitialize('chi_tra');
    const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/chinese.png`, {}, { blocks: true });
    const firstLine = blocks[0].paragraphs[0].lines[0];
    const firstWord = firstLine.words[0];
    expect(firstWord.symbols[0].text).to.be('繁');
    expect(firstWord.text).to.be('繁體');
    expect(firstLine.text).to.be('繁體 中 文 測試\n');
  }).timeout(TIMEOUT);
});
});
Loading