tesseract  4.1.1
imagedata.cpp
Go to the documentation of this file.
1 // File: imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 // training file and its corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "imagedata.h"
25 
26 #if defined(__MINGW32__)
27 #include <unistd.h>
28 #else
29 #include <thread>
30 #endif
31 
32 #include "allheaders.h" // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
33 #include "boxread.h" // for ReadMemBoxes
34 #include "callcpp.h" // for window_wait
35 #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo
36 #include "rect.h" // for TBOX
37 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
38 #include "serialis.h" // for TFile
39 #include "tprintf.h" // for tprintf
40 #include <cinttypes> // for PRId64
41 
// Upper limit on how many documents are queued for background loading ahead
// of the one currently being read while training. A small value is enough.
constexpr int kMaxReadAhead = 8;
45 
46 namespace tesseract {
47 
48 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
49 }
50 
51 WordFeature::WordFeature(const FCOORD& fcoord, uint8_t dir)
52  : x_(IntCastRounded(fcoord.x())),
53  y_(ClipToRange<int>(IntCastRounded(fcoord.y()), 0, UINT8_MAX)),
54  dir_(dir) {
55 }
56 
57 // Computes the maximum x and y value in the features.
59  int* max_x, int* max_y) {
60  *max_x = 0;
61  *max_y = 0;
62  for (int f = 0; f < features.size(); ++f) {
63  if (features[f].x_ > *max_x) *max_x = features[f].x_;
64  if (features[f].y_ > *max_y) *max_y = features[f].y_;
65  }
66 }
67 
68 // Draws the features in the given window.
70  ScrollView* window) {
71 #ifndef GRAPHICS_DISABLED
72  for (int f = 0; f < features.size(); ++f) {
73  FCOORD pos(features[f].x_, features[f].y_);
74  FCOORD dir;
75  dir.from_direction(features[f].dir_);
76  dir *= 8.0f;
77  window->SetCursor(IntCastRounded(pos.x() - dir.x()),
78  IntCastRounded(pos.y() - dir.y()));
79  window->DrawTo(IntCastRounded(pos.x() + dir.x()),
80  IntCastRounded(pos.y() + dir.y()));
81  }
82 #endif
83 }
84 
85 // Writes to the given file. Returns false in case of error.
86 bool WordFeature::Serialize(FILE* fp) const {
87  return tesseract::Serialize(fp, &x_) &&
88  tesseract::Serialize(fp, &y_) &&
89  tesseract::Serialize(fp, &dir_);
90 }
91 
92 // Reads from the given file. Returns false in case of error.
93 bool WordFeature::DeSerialize(bool swap, FILE* fp) {
94  if (!tesseract::DeSerialize(fp, &x_)) return false;
95  if (swap) ReverseN(&x_, sizeof(x_));
96  return tesseract::DeSerialize(fp, &y_) &&
97  tesseract::DeSerialize(fp, &dir_);
98 }
99 
101  const GenericVector<WordFeature>& word_features,
102  GenericVector<FloatWordFeature>* float_features) {
103  for (int i = 0; i < word_features.size(); ++i) {
105  f.x = word_features[i].x();
106  f.y = word_features[i].y();
107  f.dir = word_features[i].dir();
108  f.x_bucket = 0; // Will set it later.
109  float_features->push_back(f);
110  }
111 }
112 
113 // Sort function to sort first by x-bucket, then by y.
114 /* static */
115 int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) {
116  const auto* f1 = static_cast<const FloatWordFeature*>(v1);
117  const auto* f2 = static_cast<const FloatWordFeature*>(v2);
118  int x_diff = f1->x_bucket - f2->x_bucket;
119  if (x_diff == 0) return f1->y - f2->y;
120  return x_diff;
121 }
122 
123 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {
124 }
125 // Takes ownership of the pix and destroys it.
126 ImageData::ImageData(bool vertical, Pix* pix)
127  : page_number_(0), vertical_text_(vertical) {
128  SetPix(pix);
129 }
131 }
132 
133 // Builds and returns an ImageData from the basic data. Note that imagedata,
134 // truth_text, and box_text are all the actual file data, NOT filenames.
135 ImageData* ImageData::Build(const char* name, int page_number, const char* lang,
136  const char* imagedata, int imagedatasize,
137  const char* truth_text, const char* box_text) {
138  auto* image_data = new ImageData();
139  image_data->imagefilename_ = name;
140  image_data->page_number_ = page_number;
141  image_data->language_ = lang;
142  // Save the imagedata.
143  image_data->image_data_.resize_no_init(imagedatasize);
144  memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
145  if (!image_data->AddBoxes(box_text)) {
146  if (truth_text == nullptr || truth_text[0] == '\0') {
147  tprintf("Error: No text corresponding to page %d from image %s!\n",
148  page_number, name);
149  delete image_data;
150  return nullptr;
151  }
152  image_data->transcription_ = truth_text;
153  // If we have no boxes, the transcription is in the 0th box_texts_.
154  image_data->box_texts_.push_back(truth_text);
155  // We will create a box for the whole image on PreScale, to save unpacking
156  // the image now.
157  } else if (truth_text != nullptr && truth_text[0] != '\0' &&
158  image_data->transcription_ != truth_text) {
159  // Save the truth text as it is present and disagrees with the box text.
160  image_data->transcription_ = truth_text;
161  }
162  return image_data;
163 }
164 
165 // Writes to the given file. Returns false in case of error.
166 bool ImageData::Serialize(TFile* fp) const {
167  if (!imagefilename_.Serialize(fp)) return false;
168  if (!fp->Serialize(&page_number_)) return false;
169  if (!image_data_.Serialize(fp)) return false;
170  if (!language_.Serialize(fp)) return false;
171  if (!transcription_.Serialize(fp)) return false;
172  // WARNING: Will not work across different endian machines.
173  if (!boxes_.Serialize(fp)) return false;
174  if (!box_texts_.SerializeClasses(fp)) return false;
175  int8_t vertical = vertical_text_;
176  return fp->Serialize(&vertical);
177 }
178 
179 // Reads from the given file. Returns false in case of error.
180 // If swap is true, assumes a big/little-endian swap is needed.
182  if (!imagefilename_.DeSerialize(fp)) return false;
183  if (!fp->DeSerialize(&page_number_)) return false;
184  if (!image_data_.DeSerialize(fp)) return false;
185  if (!language_.DeSerialize(fp)) return false;
186  if (!transcription_.DeSerialize(fp)) return false;
187  // WARNING: Will not work across different endian machines.
188  if (!boxes_.DeSerialize(fp)) return false;
189  if (!box_texts_.DeSerializeClasses(fp)) return false;
190  int8_t vertical = 0;
191  if (!fp->DeSerialize(&vertical)) return false;
192  vertical_text_ = vertical != 0;
193  return true;
194 }
195 
196 // As DeSerialize, but only seeks past the data - hence a static method.
198  if (!STRING::SkipDeSerialize(fp)) return false;
199  int32_t page_number;
200  if (!fp->DeSerialize(&page_number)) return false;
201  if (!GenericVector<char>::SkipDeSerialize(fp)) return false;
202  if (!STRING::SkipDeSerialize(fp)) return false;
203  if (!STRING::SkipDeSerialize(fp)) return false;
204  if (!GenericVector<TBOX>::SkipDeSerialize(fp)) return false;
205  if (!GenericVector<STRING>::SkipDeSerializeClasses(fp)) return false;
206  int8_t vertical = 0;
207  return fp->DeSerialize(&vertical);
208 }
209 
210 // Saves the given Pix as a PNG-encoded string and destroys it.
211 // In case of missing PNG support in Leptonica use PNM format,
212 // which requires more memory.
213 void ImageData::SetPix(Pix* pix) {
214  SetPixInternal(pix, &image_data_);
215 }
216 
217 // Returns the Pix image for *this. Must be pixDestroyed after use.
218 Pix* ImageData::GetPix() const {
219  return GetPixInternal(image_data_);
220 }
221 
222 // Gets anything and everything with a non-nullptr pointer, prescaled to a
223 // given target_height (if 0, then the original image height), and aligned.
224 // Also returns (if not nullptr) the width and height of the scaled image.
225 // The return value is the scaled Pix, which must be pixDestroyed after use,
226 // and scale_factor (if not nullptr) is set to the scale factor that was applied
227 // to the image to achieve the target_height.
228 Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
229  int* scaled_width, int* scaled_height,
230  GenericVector<TBOX>* boxes) const {
231  int input_width = 0;
232  int input_height = 0;
233  Pix* src_pix = GetPix();
234  ASSERT_HOST(src_pix != nullptr);
235  input_width = pixGetWidth(src_pix);
236  input_height = pixGetHeight(src_pix);
237  if (target_height == 0) {
238  target_height = std::min(input_height, max_height);
239  }
240  float im_factor = static_cast<float>(target_height) / input_height;
241  if (scaled_width != nullptr)
242  *scaled_width = IntCastRounded(im_factor * input_width);
243  if (scaled_height != nullptr)
244  *scaled_height = target_height;
245  // Get the scaled image.
246  Pix* pix = pixScale(src_pix, im_factor, im_factor);
247  if (pix == nullptr) {
248  tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
249  input_width, input_height, im_factor);
250  }
251  if (scaled_width != nullptr) *scaled_width = pixGetWidth(pix);
252  if (scaled_height != nullptr) *scaled_height = pixGetHeight(pix);
253  pixDestroy(&src_pix);
254  if (boxes != nullptr) {
255  // Get the boxes.
256  boxes->truncate(0);
257  for (int b = 0; b < boxes_.size(); ++b) {
258  TBOX box = boxes_[b];
259  box.scale(im_factor);
260  boxes->push_back(box);
261  }
262  if (boxes->empty()) {
263  // Make a single box for the whole image.
264  TBOX box(0, 0, im_factor * input_width, target_height);
265  boxes->push_back(box);
266  }
267  }
268  if (scale_factor != nullptr) *scale_factor = im_factor;
269  return pix;
270 }
271 
273  return image_data_.size();
274 }
275 
276 // Draws the data in a new window.
277 void ImageData::Display() const {
278 #ifndef GRAPHICS_DISABLED
279  const int kTextSize = 64;
280  // Draw the image.
281  Pix* pix = GetPix();
282  if (pix == nullptr) return;
283  int width = pixGetWidth(pix);
284  int height = pixGetHeight(pix);
285  auto* win = new ScrollView("Imagedata", 100, 100,
286  2 * (width + 2 * kTextSize),
287  2 * (height + 4 * kTextSize),
288  width + 10, height + 3 * kTextSize, true);
289  win->Image(pix, 0, height - 1);
290  pixDestroy(&pix);
291  // Draw the boxes.
292  win->Pen(ScrollView::RED);
293  win->Brush(ScrollView::NONE);
294  int text_size = kTextSize;
295  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
296  text_size = boxes_[0].height() * 2;
297  win->TextAttributes("Arial", text_size, false, false, false);
298  if (!boxes_.empty()) {
299  for (int b = 0; b < boxes_.size(); ++b) {
300  boxes_[b].plot(win);
301  win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
302  }
303  } else {
304  // The full transcription.
305  win->Pen(ScrollView::CYAN);
306  win->Text(0, height + kTextSize * 2, transcription_.string());
307  }
308  win->Update();
309  window_wait(win);
310 #endif
311 }
312 
313 // Adds the supplied boxes and transcriptions that correspond to the correct
314 // page number.
316  const GenericVector<STRING>& texts,
317  const GenericVector<int>& box_pages) {
318  // Copy the boxes and make the transcription.
319  for (int i = 0; i < box_pages.size(); ++i) {
320  if (page_number_ >= 0 && box_pages[i] != page_number_) continue;
321  transcription_ += texts[i];
322  boxes_.push_back(boxes[i]);
323  box_texts_.push_back(texts[i]);
324  }
325 }
326 
327 // Saves the given Pix as a PNG-encoded string and destroys it.
328 // In case of missing PNG support in Leptonica use PNM format,
329 // which requires more memory.
330 void ImageData::SetPixInternal(Pix* pix, GenericVector<char>* image_data) {
331  l_uint8* data;
332  size_t size;
333  l_int32 ret;
334  ret = pixWriteMem(&data, &size, pix, IFF_PNG);
335  if (ret) {
336  ret = pixWriteMem(&data, &size, pix, IFF_PNM);
337  }
338  pixDestroy(&pix);
339  image_data->resize_no_init(size);
340  memcpy(&(*image_data)[0], data, size);
341  lept_free(data);
342 }
343 
344 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
345 Pix* ImageData::GetPixInternal(const GenericVector<char>& image_data) {
346  Pix* pix = nullptr;
347  if (!image_data.empty()) {
348  // Convert the array to an image.
349  const auto* u_data =
350  reinterpret_cast<const unsigned char*>(&image_data[0]);
351  pix = pixReadMem(u_data, image_data.size());
352  }
353  return pix;
354 }
355 
356 // Parses the text string as a box file and adds any discovered boxes that
357 // match the page number. Returns false on error.
358 bool ImageData::AddBoxes(const char* box_text) {
359  if (box_text != nullptr && box_text[0] != '\0') {
361  GenericVector<STRING> texts;
362  GenericVector<int> box_pages;
363  if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
364  /*continue_on_failure*/ true, &boxes, &texts, nullptr,
365  &box_pages)) {
366  AddBoxes(boxes, texts, box_pages);
367  return true;
368  } else {
369  tprintf("Error: No boxes for page %d from image %s!\n",
370  page_number_, imagefilename_.string());
371  }
372  }
373  return false;
374 }
375 
376 // Thread function to call ReCachePages.
377 void* ReCachePagesFunc(void* data) {
378  auto* document_data = static_cast<DocumentData*>(data);
379  document_data->ReCachePages();
380  return nullptr;
381 }
382 
384  : document_name_(name),
385  pages_offset_(-1),
386  total_pages_(-1),
387  memory_used_(0),
388  max_memory_(0),
389  reader_(nullptr) {}
390 
392  SVAutoLock lock_p(&pages_mutex_);
393  SVAutoLock lock_g(&general_mutex_);
394 }
395 
396 // Reads all the pages in the given lstmf filename to the cache. The reader
397 // is used to read the file.
398 bool DocumentData::LoadDocument(const char* filename, int start_page,
399  int64_t max_memory, FileReader reader) {
400  SetDocument(filename, max_memory, reader);
401  pages_offset_ = start_page;
402  return ReCachePages();
403 }
404 
405 // Sets up the document, without actually loading it.
406 void DocumentData::SetDocument(const char* filename, int64_t max_memory,
407  FileReader reader) {
408  SVAutoLock lock_p(&pages_mutex_);
409  SVAutoLock lock(&general_mutex_);
410  document_name_ = filename;
411  pages_offset_ = -1;
412  max_memory_ = max_memory;
413  reader_ = reader;
414 }
415 
416 // Writes all the pages to the given filename. Returns false on error.
417 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
418  SVAutoLock lock(&pages_mutex_);
419  TFile fp;
420  fp.OpenWrite(nullptr);
421  if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
422  tprintf("Serialize failed: %s\n", filename);
423  return false;
424  }
425  return true;
426 }
428  SVAutoLock lock(&pages_mutex_);
429  TFile fp;
430  fp.OpenWrite(buffer);
431  return pages_.Serialize(&fp);
432 }
433 
434 // Adds the given page data to this document, counting up memory.
436  SVAutoLock lock(&pages_mutex_);
437  pages_.push_back(page);
438  set_memory_used(memory_used() + page->MemoryUsed());
439 }
440 
441 // If the given index is not currently loaded, loads it using a separate
442 // thread.
444  ImageData* page = nullptr;
445  if (IsPageAvailable(index, &page)) return;
446  SVAutoLock lock(&pages_mutex_);
447  if (pages_offset_ == index) return;
448  pages_offset_ = index;
449  pages_.clear();
451 }
452 
453 // Returns a pointer to the page with the given index, modulo the total
454 // number of pages. Blocks until the background load is completed.
455 const ImageData* DocumentData::GetPage(int index) {
456  ImageData* page = nullptr;
457  while (!IsPageAvailable(index, &page)) {
458  // If there is no background load scheduled, schedule one now.
459  pages_mutex_.Lock();
460  bool needs_loading = pages_offset_ != index;
461  pages_mutex_.Unlock();
462  if (needs_loading) LoadPageInBackground(index);
463  // We can't directly load the page, or the background load will delete it
464  // while the caller is using it, so give it a chance to work.
465 #if defined(__MINGW32__)
466  sleep(1);
467 #else
468  std::this_thread::sleep_for(std::chrono::seconds(1));
469 #endif
470  }
471  return page;
472 }
473 
474 // Returns true if the requested page is available, and provides a pointer,
475 // which may be nullptr if the document is empty. May block, even though it
476 // doesn't guarantee to return true.
477 bool DocumentData::IsPageAvailable(int index, ImageData** page) {
478  SVAutoLock lock(&pages_mutex_);
479  int num_pages = NumPages();
480  if (num_pages == 0 || index < 0) {
481  *page = nullptr; // Empty Document.
482  return true;
483  }
484  if (num_pages > 0) {
485  index = Modulo(index, num_pages);
486  if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
487  *page = pages_[index - pages_offset_]; // Page is available already.
488  return true;
489  }
490  }
491  return false;
492 }
493 
494 // Removes all pages from memory and frees the memory, but does not forget
495 // the document metadata.
497  SVAutoLock lock(&pages_mutex_);
498  int64_t memory_saved = memory_used();
499  pages_.clear();
500  pages_offset_ = -1;
501  set_total_pages(-1);
502  set_memory_used(0);
503  tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
504  document_name_.string(), memory_saved);
505  return memory_saved;
506 }
507 
508 // Shuffles all the pages in the document.
510  TRand random;
511  // Different documents get shuffled differently, but the same for the same
512  // name.
513  random.set_seed(document_name_.string());
514  int num_pages = pages_.size();
515  // Execute one random swap for each page in the document.
516  for (int i = 0; i < num_pages; ++i) {
517  int src = random.IntRand() % num_pages;
518  int dest = random.IntRand() % num_pages;
519  std::swap(pages_[src], pages_[dest]);
520  }
521 }
522 
523 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
524 // starting at index pages_offset_.
525 bool DocumentData::ReCachePages() {
526  SVAutoLock lock(&pages_mutex_);
527  // Read the file.
528  set_total_pages(0);
529  set_memory_used(0);
530  int loaded_pages = 0;
531  pages_.truncate(0);
532  TFile fp;
533  if (!fp.Open(document_name_, reader_) ||
534  !PointerVector<ImageData>::DeSerializeSize(&fp, &loaded_pages) ||
535  loaded_pages <= 0) {
536  tprintf("Deserialize header failed: %s\n", document_name_.string());
537  return false;
538  }
539  pages_offset_ %= loaded_pages;
540  // Skip pages before the first one we want, and load the rest until max
541  // memory and skip the rest after that.
542  int page;
543  for (page = 0; page < loaded_pages; ++page) {
544  if (page < pages_offset_ ||
545  (max_memory_ > 0 && memory_used() > max_memory_)) {
547  tprintf("Deserializeskip failed\n");
548  break;
549  }
550  } else {
551  if (!pages_.DeSerializeElement(&fp)) break;
552  ImageData* image_data = pages_.back();
553  if (image_data->imagefilename().length() == 0) {
554  image_data->set_imagefilename(document_name_);
555  image_data->set_page_number(page);
556  }
557  set_memory_used(memory_used() + image_data->MemoryUsed());
558  }
559  }
560  if (page < loaded_pages) {
561  tprintf("Deserialize failed: %s read %d/%d lines\n",
562  document_name_.string(), page, loaded_pages);
563  pages_.truncate(0);
564  } else {
565  tprintf("Loaded %d/%d lines (%d-%d) of document %s\n", pages_.size(),
566  loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
567  document_name_.string());
568  }
569  set_total_pages(loaded_pages);
570  return !pages_.empty();
571 }
572 
573 // A collection of DocumentData that knows roughly how much memory it is using.
574 DocumentCache::DocumentCache(int64_t max_memory)
575  : num_pages_per_doc_(0), max_memory_(max_memory) {}
577 
578 // Adds all the documents in the list of filenames, counting memory.
579 // The reader is used to read the files.
581  CachingStrategy cache_strategy,
582  FileReader reader) {
583  cache_strategy_ = cache_strategy;
584  int64_t fair_share_memory = 0;
585  // In the round-robin case, each DocumentData handles restricting its content
586  // to its fair share of memory. In the sequential case, DocumentCache
587  // determines which DocumentDatas are held entirely in memory.
588  if (cache_strategy_ == CS_ROUND_ROBIN)
589  fair_share_memory = max_memory_ / filenames.size();
590  for (int arg = 0; arg < filenames.size(); ++arg) {
591  STRING filename = filenames[arg];
592  auto* document = new DocumentData(filename);
593  document->SetDocument(filename.string(), fair_share_memory, reader);
594  AddToCache(document);
595  }
596  if (!documents_.empty()) {
597  // Try to get the first page now to verify the list of filenames.
598  if (GetPageBySerial(0) != nullptr) return true;
599  tprintf("Load of page 0 failed!\n");
600  }
601  return false;
602 }
603 
604 // Adds document to the cache.
606  documents_.push_back(data);
607  return true;
608 }
609 
610 // Finds and returns a document by name.
611 DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
612  for (int i = 0; i < documents_.size(); ++i) {
613  if (documents_[i]->document_name() == document_name)
614  return documents_[i];
615  }
616  return nullptr;
617 }
618 
619 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
620 // strategy, could take a long time.
622  if (cache_strategy_ == CS_SEQUENTIAL) {
623  // In sequential mode, we assume each doc has the same number of pages
624  // whether it is true or not.
625  if (num_pages_per_doc_ == 0) GetPageSequential(0);
626  return num_pages_per_doc_ * documents_.size();
627  }
628  int total_pages = 0;
629  int num_docs = documents_.size();
630  for (int d = 0; d < num_docs; ++d) {
631  // We have to load a page to make NumPages() valid.
632  documents_[d]->GetPage(0);
633  total_pages += documents_[d]->NumPages();
634  }
635  return total_pages;
636 }
637 
638 // Returns a page by serial number, selecting them in a round-robin fashion
639 // from all the documents. Highly disk-intensive, but doesn't need samples
640 // to be shuffled between files to begin with.
641 const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
642  int num_docs = documents_.size();
643  int doc_index = serial % num_docs;
644  const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs);
645  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
646  doc_index = (serial + offset) % num_docs;
647  int page = (serial + offset) / num_docs;
648  documents_[doc_index]->LoadPageInBackground(page);
649  }
650  return doc;
651 }
652 
653 // Returns a page by serial number, selecting them in sequence from each file.
654 // Requires the samples to be shuffled between the files to give a random or
655 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
656 const ImageData* DocumentCache::GetPageSequential(int serial) {
657  int num_docs = documents_.size();
658  ASSERT_HOST(num_docs > 0);
659  if (num_pages_per_doc_ == 0) {
660  // Use the pages in the first doc as the number of pages in each doc.
661  documents_[0]->GetPage(0);
662  num_pages_per_doc_ = documents_[0]->NumPages();
663  if (num_pages_per_doc_ == 0) {
664  tprintf("First document cannot be empty!!\n");
665  ASSERT_HOST(num_pages_per_doc_ > 0);
666  }
667  // Get rid of zero now if we don't need it.
668  if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
669  }
670  int doc_index = serial / num_pages_per_doc_ % num_docs;
671  const ImageData* doc =
672  documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
673  // Count up total memory. Background loading makes it more complicated to
674  // keep a running count.
675  int64_t total_memory = 0;
676  for (int d = 0; d < num_docs; ++d) {
677  total_memory += documents_[d]->memory_used();
678  }
679  if (total_memory >= max_memory_) {
680  // Find something to un-cache.
681  // If there are more than 3 in front, then serial is from the back reader
682  // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
683  // we create a hole between them and then un-caching the backmost occupied
684  // will work for both.
685  int num_in_front = CountNeighbourDocs(doc_index, 1);
686  for (int offset = num_in_front - 2;
687  offset > 1 && total_memory >= max_memory_; --offset) {
688  int next_index = (doc_index + offset) % num_docs;
689  total_memory -= documents_[next_index]->UnCache();
690  }
691  // If that didn't work, the best solution is to un-cache from the back. If
692  // we take away the document that a 2nd reader is using, it will put it
693  // back and make a hole between.
694  int num_behind = CountNeighbourDocs(doc_index, -1);
695  for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
696  ++offset) {
697  int next_index = (doc_index + offset + num_docs) % num_docs;
698  total_memory -= documents_[next_index]->UnCache();
699  }
700  }
701  int next_index = (doc_index + 1) % num_docs;
702  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
703  documents_[next_index]->LoadPageInBackground(0);
704  }
705  return doc;
706 }
707 
708 // Helper counts the number of adjacent cached neighbours of index looking in
709 // direction dir, ie index+dir, index+2*dir etc.
710 int DocumentCache::CountNeighbourDocs(int index, int dir) {
711  int num_docs = documents_.size();
712  for (int offset = dir; abs(offset) < num_docs; offset += dir) {
713  int offset_index = (index + offset + num_docs) % num_docs;
714  if (!documents_[offset_index]->IsCached()) return offset - dir;
715  }
716  return num_docs;
717 }
718 
719 } // namespace tesseract.
int MemoryUsed() const
Definition: imagedata.cpp:272
bool empty() const
Definition: genericvector.h:91
void Display() const
Definition: imagedata.cpp:277
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:435
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void Lock()
Locks on a mutex.
Definition: svutil.cpp:64
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:477
bool(*)(const GenericVector< char > &, const STRING &) FileWriter
Definition: serialis.h:52
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:315
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:296
void scale(const float f)
Definition: rect.h:175
void resize_no_init(int size)
Definition: genericvector.h:66
void Unlock()
Unlocks on a mutex.
Definition: svutil.cpp:72
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:344
Definition: points.h:188
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
bool Serialize(FILE *fp) const
Definition: strngs.cpp:146
int32_t IntRand()
Definition: helpers.h:50
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:197
DocumentData * FindDocument(const STRING &document_name) const
Definition: imagedata.cpp:611
bool DeSerialize(TFile *fp)
Definition: imagedata.cpp:181
bool SerializeClasses(FILE *fp) const
friend void * ReCachePagesFunc(void *data)
Definition: imagedata.cpp:377
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:69
const char * string() const
Definition: strngs.cpp:194
bool CloseWrite(const STRING &filename, FileWriter writer)
Definition: serialis.cpp:311
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:108
void set_seed(uint64_t seed)
Definition: helpers.h:40
bool LoadDocuments(const GenericVector< STRING > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:580
void truncate(int size)
static void FromWordFeatures(const GenericVector< WordFeature > &word_features, GenericVector< FloatWordFeature > *float_features)
Definition: imagedata.cpp:100
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:58
void * ReCachePagesFunc(void *data)
Definition: imagedata.cpp:377
static int SortByXBucket(const void *, const void *)
Definition: imagedata.cpp:115
void DrawTo(int x, int y)
Definition: scrollview.cpp:525
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:69
CachingStrategy
Definition: imagedata.h:42
bool DeSerialize(bool swap, FILE *fp)
bool DeSerializeClasses(bool swap, FILE *fp)
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:86
bool(*)(const STRING &, GenericVector< char > *) FileReader
Definition: serialis.h:49
static bool SkipDeSerialize(TFile *fp)
Definition: imagedata.cpp:197
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:104
float y() const
Definition: points.h:210
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:398
const ImageData * GetPage(int index)
Definition: imagedata.cpp:455
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:406
int NumPages() const
Definition: imagedata.h:233
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:159
float x() const
Definition: points.h:207
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
static void StartThread(void *(*func)(void *), void *arg)
Create new thread.
Definition: svutil.cpp:81
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:605
Pix * PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, GenericVector< TBOX > *boxes) const
Definition: imagedata.cpp:228
void LoadPageInBackground(int index)
Definition: imagedata.cpp:443
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:150
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:28
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
static bool DeSerializeSize(TFile *fp, int32_t *size)
Definition: strngs.h:45
bool SaveToBuffer(GenericVector< char > *buffer)
Definition: imagedata.cpp:427
Definition: rect.h:34
int64_t memory_used() const
Definition: imagedata.h:240
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:179
int page_number() const
Definition: imagedata.h:132
DocumentData(const STRING &name)
Definition: imagedata.cpp:383
int Modulo(int a, int b)
Definition: helpers.h:158
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:135
int push_back(T object)
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:417
int size() const
Definition: genericvector.h:72
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:93
void SetPix(Pix *pix)
Definition: imagedata.cpp:213
const STRING & box_text(int index) const
Definition: imagedata.h:156
Pix * GetPix() const
Definition: imagedata.cpp:218
const int kMaxReadAhead
Definition: imagedata.cpp:44
bool Serialize(FILE *fp) const
#define ASSERT_HOST(x)
Definition: errcode.h:88
const GenericVector< char > & image_data() const
Definition: imagedata.h:138
int IntCastRounded(double x)
Definition: helpers.h:175
void SetCursor(int x, int y)
Definition: scrollview.cpp:519
DocumentCache(int64_t max_memory)
Definition: imagedata.cpp:574
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:166
static bool DeSerializeSkip(TFile *fp)