Skip to content

Commit

Permalink
Added ability for box files to store spaces and newlines
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1060 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
[email protected] committed Apr 23, 2014
1 parent c3b9b7c commit 8364f24
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 23 deletions.
25 changes: 20 additions & 5 deletions ccmain/applybox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,9 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
return xheights.median();
}

// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
BLOCK_LIST *block_list) {
// Any row xheight that is significantly different from the median is set
// to the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
double median_xheight = MedianXHeight(block_list);
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
// Strip all fuzzy space markers to simplify the PAGE_RES.
Expand All @@ -212,6 +211,22 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
}
row->set_x_height(static_cast<float>(median_xheight));
}
}
}
}

// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
BLOCK_LIST *block_list) {
PreenXHeights(block_list);
// Strip all fuzzy space markers to simplify the PAGE_RES.
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
ROW* row = r_it.data();
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
Expand Down Expand Up @@ -434,7 +449,7 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
if (!box.major_overlap(block->bounding_box()))
continue;
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW* row = r_it.data();
if (!box.major_overlap(row->bounding_box()))
continue;
Expand Down
4 changes: 4 additions & 0 deletions ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,10 @@ class Tesseract : public Wordrec {
PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
BLOCK_LIST *block_list);

// Any row xheight that is significantly different from the median is set
// to the median.
void PreenXHeights(BLOCK_LIST *block_list);

// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes,
Expand Down
93 changes: 81 additions & 12 deletions ccstruct/boxread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,7 @@ static const char* kMultiBlobLabelCode = "WordStr";

// Open the boxfile based on the given image filename.
FILE* OpenBoxFile(const STRING& fname) {
STRING filename = fname;
const char *lastdot = strrchr(filename.string(), '.');
if (lastdot != NULL)
filename[lastdot - filename.string()] = '\0';

filename += ".box";
STRING filename = BoxFileName(fname);
FILE* box_file = NULL;
if (!(box_file = fopen(filename.string(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT,
Expand All @@ -46,6 +41,70 @@ FILE* OpenBoxFile(const STRING& fname) {
return box_file;
}

// Reads all boxes from the box file that corresponds to the given filename;
// the box file name is derived from filename with BoxFileName.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be NULL).
// Returns false if the box file cannot be loaded or no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
                  GenericVector<TBOX>* boxes,
                  GenericVector<STRING>* texts,
                  GenericVector<STRING>* box_texts,
                  GenericVector<int>* pages) {
  GenericVector<char> box_data;
  if (!tesseract::LoadDataFromFile(BoxFileName(filename), &box_data))
    return false;
  // ReadMemBoxes builds a STRING from a bare const char*, so the buffer has
  // to be NUL-terminated. Nothing here shows the loader doing that, so
  // terminate explicitly (an extra terminator is harmless). This also makes
  // the vector non-empty, so &box_data[0] is valid even for an empty file.
  box_data.push_back('\0');
  return ReadMemBoxes(target_page, skip_blanks, &box_data[0], boxes, texts,
                      box_texts, pages);
}

// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
// box_data is split on '\n' and each line is parsed with ParseBoxFileStr;
// lines that fail to parse are skipped.
// If target_page >= 0 only boxes on that page are kept, otherwise all pages.
// If skip_blanks is true, boxes whose label is a single space are dropped.
// For every kept box, the box, its UTF-8 label, its re-serialized box string
// and its page number are appended to boxes, texts, box_texts and pages
// respectively; each output vector may be NULL.
// Returns true if at least one box was kept.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
                  GenericVector<TBOX>* boxes,
                  GenericVector<STRING>* texts,
                  GenericVector<STRING>* box_texts,
                  GenericVector<int>* pages) {
  STRING box_str(box_data);
  GenericVector<STRING> lines;
  box_str.split('\n', &lines);
  if (lines.empty()) return false;
  int num_boxes = 0;
  for (int i = 0; i < lines.size(); ++i) {
    int page = 0;
    STRING utf8_str;
    TBOX box;
    if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) {
      continue;
    }
    if (skip_blanks && utf8_str == " ") continue;
    if (target_page >= 0 && page != target_page) continue;
    if (boxes != NULL) boxes->push_back(box);
    if (texts != NULL) texts->push_back(utf8_str);
    if (box_texts != NULL) {
      STRING full_text;
      // Serialize with the page parsed from this line. It equals target_page
      // whenever a page filter is active, but target_page is negative when
      // all pages are requested and must not leak into the box string.
      MakeBoxFileStr(utf8_str.string(), box, page, &full_text);
      box_texts->push_back(full_text);
    }
    if (pages != NULL) pages->push_back(page);
    ++num_boxes;
  }
  return num_boxes > 0;
}

// Derives the name of the box file that belongs to image_filename:
// everything from the last '.' onwards (if there is one) is dropped and
// ".box" is appended.
STRING BoxFileName(const STRING& image_filename) {
  STRING result = image_filename;
  const char* name_start = result.string();
  const char* extension = strrchr(name_start, '.');
  if (extension != NULL) {
    // Cut the name just before the final dot.
    result.truncate_at(extension - name_start);
  }
  result += ".box";
  return result;
}

// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
// Box files are used ONLY DURING TRAINING, but by both processes of
// creating tr files with tesseract, and unicharset_extractor.
// ReadNextBox factors out the code to interpret a line of a box
Expand Down Expand Up @@ -78,8 +137,9 @@ bool ReadNextBox(int target_page, int *line_number, FILE* box_file,
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
// Check for blank lines in box file
while (*buffptr == ' ' || *buffptr == '\t')
buffptr++;
if (*buffptr == '\n' || *buffptr == '\0') continue;
// Skip blank boxes.
if (*buffptr == ' ' || *buffptr == '\t') continue;
if (*buffptr != '\0') {
if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {
tprintf("Box file format error on line %i; ignored\n", *line_number);
Expand Down Expand Up @@ -113,18 +173,25 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
// as whitespace by sscanf, so it is more reliable to just find
// ascii space and tab.
int uch_len = 0;
while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
uch_len < kBoxReadBufSize - 1) {
// Skip unicode file designation, if present.
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3;
// Allow a single blank as the UTF-8 string. Check for empty string and
// then blindly eat the first character.
if (*buffptr == '\0') return false;
do {
uch[uch_len++] = *buffptr++;
}
} while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
uch_len < kBoxReadBufSize - 1);
uch[uch_len] = '\0';
if (*buffptr != '\0') ++buffptr;
int x_min, y_min, x_max, y_max;
*page_number = 0;
int count = sscanf(buffptr, "%d %d %d %d %d",
&x_min, &y_min, &x_max, &y_max, page_number);
if (count != 5 && count != 4) {
tprintf("Bad box coordinates in boxfile string!\n");
tprintf("Bad box coordinates in boxfile string! %s\n", ubuf);
return false;
}
// Test for long space-delimited string label.
Expand All @@ -148,6 +215,8 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
used += new_used;
}
*utf8_str = uch;
if (x_min > x_max) Swap(&x_min, &x_max);
if (y_min > y_max) Swap(&y_min, &y_max);
bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
return true; // Successfully read a box.
}
Expand Down
25 changes: 25 additions & 0 deletions ccstruct/boxread.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define TESSERACT_CCUTIL_BOXREAD_H__

#include <stdio.h>
#include "genericvector.h"
#include "strngs.h"

class STRING;
Expand All @@ -30,8 +31,32 @@ class TBOX;
const int kBoxReadBufSize = 1024;

// Open the boxfile based on the given image filename.
// Returns NULL if the box file cannot be opened.
FILE* OpenBoxFile(const STRING& fname);

// Reads all boxes from the box file belonging to the given filename.
// filename is converted with BoxFileName before loading, so it is the image
// file name (its extension is replaced by ".box"), not the box file name.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be NULL).
// Returns false if the box file cannot be loaded or no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);

// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
// box_data holds the text of a box file, one box definition per line
// (lines are separated by '\n'); lines that cannot be parsed are skipped.
// NOTE(review): box_data is used to construct a STRING from a bare pointer,
// so it presumably has to be NUL-terminated -- confirm against callers.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);

// Returns the box file name corresponding to the given image_filename.
STRING BoxFileName(const STRING& image_filename);

// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords
Expand Down
50 changes: 44 additions & 6 deletions training/boxchar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <stddef.h>

#include "fileio.h"
#include "ndminx.h"

namespace tesseract {

Expand Down Expand Up @@ -77,23 +78,60 @@ void BoxChar::RotateBoxes(float rotation,
boxaDestroy(&rotated);
}

const int kMaxLineLength = 1024;
// Helper appends a tab box to the string to indicate a newline. We can't use
// an actual newline as the file format is line-based text.
static void AppendTabBox(const Box* box, int height, int page, string* output) {
char buffer[kMaxLineLength];
int nbytes = snprintf(buffer, kMaxLineLength, "\t %d %d %d %d %d\n",
box->x + box->w, height - box->y - box->h,
box->x + box->w + 10, height - box->y, page);
output->append(buffer, nbytes);
}

/* static */
void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
const vector<BoxChar*>& boxes) {
string output;
const int kMaxLineLength = 1024;
char buffer[kMaxLineLength];
for (int i = 0; i < boxes.size(); ++i) {
if (boxes[i]->box_ != NULL) {
const Box* box = boxes[i]->box_;
if (box != NULL) {
if (i > 0 && boxes[i - 1]->box_ != NULL &&
boxes[i - 1]->page_ == boxes[i]->page_ &&
box->x + box->w < boxes[i - 1]->box_->x) {
// We are on a newline. Output a tab character to indicate the newline.
AppendTabBox(boxes[i - 1]->box_, height, boxes[i]->page_, &output);
}
int nbytes = snprintf(buffer, kMaxLineLength,
"%s %d %d %d %d %d\n",
boxes[i]->ch_.c_str(),
boxes[i]->box_->x,
height - boxes[i]->box_->y - boxes[i]->box_->h,
boxes[i]->box_->x + boxes[i]->box_->w,
height - boxes[i]->box_->y,
box->x, height - box->y - box->h,
box->x + box->w, height - box->y,
boxes[i]->page_);
output.append(buffer, nbytes);
} else if (i > 0 && boxes[i - 1]->box_ != NULL) {
int j = i + 1;
// Find the next non-null box, as there may be multiple spaces.
while (j < boxes.size() && boxes[j]->box_ == NULL) ++j;
if (j < boxes.size() && boxes[i - 1]->page_ == boxes[j]->page_) {
const Box* prev = boxes[i - 1]->box_;
const Box* next = boxes[j]->box_;
if (next->x + next->w < prev->x) {
// We are on a newline. Output a tab character to indicate it.
AppendTabBox(prev, height, boxes[j]->page_, &output);
} else {
// Space between words.
int nbytes = snprintf(buffer, kMaxLineLength,
" %d %d %d %d %d\n",
prev->x + prev->w,
height - MAX(prev->y + prev->h,
next->y + next->h),
next->x, height - MIN(prev->y, next->y),
boxes[i - 1]->page_);
output.append(buffer, nbytes);
}
}
}
}
File::WriteStringToFileOrDie(output, filename);
Expand Down

0 comments on commit 8364f24

Please sign in to comment.