tesseract  4.1.1
tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

Public Member Functions

 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Static Public Member Functions

static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 

Public Attributes

bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_ambigs_for_adaption = false
 

Protected Attributes

IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 

Detailed Description

Definition at line 103 of file classify.h.

Constructor & Destructor Documentation

◆ Classify()

tesseract::Classify::Classify ( )

Definition at line 60 of file classify.cpp.

61  : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
62  this->params()),
64  "Prioritize blob division over chopping", this->params()),
65  BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
66  this->params()),
67  INT_MEMBER(classify_debug_level, 0, "Classify debug level",
68  this->params()),
69  INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
70  this->params()),
72  "Character Normalization Range ...", this->params()),
74  "Veto ratio between classifier ratings", this->params()),
76  "Veto difference between classifier certainties",
77  this->params()),
78  BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
79  this->params()),
80  BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
81  this->params()),
83  "Enable adaptive classifier", this->params()),
85  "Use pre-adapted classifier templates", this->params()),
87  "Save adapted templates to a file", this->params()),
88  BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
89  this->params()),
91  "Non-linear stroke-density normalization", this->params()),
92  INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
93  INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
94  INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
95  this->params()),
96  double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
97  this->params()),
98  double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
99  this->params()),
100  double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
101  this->params()),
102  double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
103  this->params()),
104  double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
105  this->params()),
106  double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
107  this->params()),
108  INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
109  this->params()),
111  "Reliable Config Threshold", this->params()),
113  "Enable adaption even if the ambiguities have not been seen",
114  this->params()),
116  "Maximum angle delta for prototype clustering",
117  this->params()),
119  "Penalty to apply when a non-alnum is vertically out of "
120  "its expected textline position",
121  this->params()),
122  double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
123  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
124  this->params()),
126  "Scale factor for features not used", this->params()),
129  "Prune poor adapted results this much worse than best result",
130  this->params()),
132  "Threshold at which classify_adapted_pruning_factor starts",
133  this->params()),
135  "Threshold for good protos during adaptive 0-255",
136  this->params()),
138  "Threshold for good features during adaptive 0-255",
139  this->params()),
141  "Do not include character fragments in the"
142  " results of the classifier",
143  this->params()),
145  -3.0,
146  "Exclude fragments that do not look like whole"
147  " characters from training and adaption",
148  this->params()),
150  "Bring up graphical debugging windows for fragments training",
151  this->params()),
153  "Use two different windows for debugging the matching: "
154  "One for the protos and one for the features.",
155  this->params()),
156  STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
157  this->params()),
159  "Class Pruner Threshold 0-255", this->params()),
161  "Class Pruner Multiplier 0-255: ", this->params()),
163  "Class Pruner CutoffStrength: ", this->params()),
165  "Integer Matcher Multiplier 0-255: ", this->params()),
167  "Assume the input is numbers [0-9].", this->params()),
168  double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
169  this->params()),
171  "Penalty to add to worst rating for noise", this->params()),
173  shape_table_(nullptr),
174  dict_(this),
175  static_classifier_(nullptr) {
176  fontinfo_table_.set_compare_callback(
178  fontinfo_table_.set_clear_callback(
180  fontset_table_.set_compare_callback(
182  fontset_table_.set_clear_callback(
184  AdaptedTemplates = nullptr;
185  BackupAdaptedTemplates = nullptr;
186  PreTrainedTemplates = nullptr;
187  AllProtosOn = nullptr;
188  AllConfigsOn = nullptr;
189  AllConfigsOff = nullptr;
190  TempProtoMask = nullptr;
191  NormProtos = nullptr;
192 
193  NumAdaptationsFailed = 0;
194 
195  learn_debug_win_ = nullptr;
196  learn_fragmented_word_debug_win_ = nullptr;
197  learn_fragments_debug_win_ = nullptr;
199 }
bool classify_enable_adaptive_matcher
Definition: classify.h:445
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
ShapeTable * shape_table_
Definition: classify.h:546
bool classify_debug_character_fragments
Definition: classify.h:491
int classify_adapt_feature_threshold
Definition: classify.h:483
BIT_VECTOR TempProtoMask
Definition: classify.h:525
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
bool classify_enable_learning
Definition: classify.h:429
double speckle_large_max_size
Definition: classify.h:509
double matcher_clustering_max_angle_delta
Definition: classify.h:468
bool classify_nonlinear_norm
Definition: classify.h:452
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
int classify_adapt_proto_threshold
Definition: classify.h:481
int classify_class_pruner_threshold
Definition: classify.h:499
int classify_cp_cutoff_strength
Definition: classify.h:503
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
double speckle_rating_penalty
Definition: classify.h:511
double classify_max_certainty_margin
Definition: classify.h:440
double matcher_rating_margin
Definition: classify.h:460
int classify_class_pruner_multiplier
Definition: classify.h:501
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
double classify_misfit_junk_penalty
Definition: classify.h:471
double matcher_reliable_adaptive_result
Definition: classify.h:457
double matcher_good_threshold
Definition: classify.h:456
double matcher_avg_noise_size
Definition: classify.h:461
bool classify_save_adapted_templates
Definition: classify.h:449
bool allow_blob_division
Definition: classify.h:423
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
double classify_char_norm_range
Definition: classify.h:436
IntegerMatcher im_
Definition: classify.h:540
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:147
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:127
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
BIT_VECTOR AllProtosOn
Definition: classify.h:522
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
double certainty_scale
Definition: classify.h:473
double classify_max_rating_ratio
Definition: classify.h:438
char * classify_learn_debug_str
Definition: classify.h:495
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
bool classify_enable_adaptive_debugger
Definition: classify.h:450
double tessedit_class_miss_scale
Definition: classify.h:475
int classify_learning_debug_level
Definition: classify.h:455
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
ParamsVectors * params()
Definition: ccutil.h:67
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
int matcher_permanent_classes_min
Definition: classify.h:462
bool classify_bln_numeric_mode
Definition: classify.h:508
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:138
bool classify_use_pre_adapted_templates
Definition: classify.h:447
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:489
bool matcher_debug_separate_windows
Definition: classify.h:494
int classify_integer_matcher_multiplier
Definition: classify.h:505
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
double matcher_perfect_threshold
Definition: classify.h:458
bool prioritize_division
Definition: classify.h:428
bool disable_character_fragments
Definition: classify.h:486
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:466
double classify_adapted_pruning_threshold
Definition: classify.h:479
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
double matcher_bad_match_pad
Definition: classify.h:459
double classify_adapted_pruning_factor
Definition: classify.h:477
int matcher_min_examples_for_prototyping
Definition: classify.h:464
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
NORM_PROTOS * NormProtos
Definition: classify.h:527

◆ ~Classify()

tesseract::Classify::~Classify ( )
override

Definition at line 201 of file classify.cpp.

201  {
203  delete learn_debug_win_;
204  delete learn_fragmented_word_debug_win_;
205  delete learn_fragments_debug_win_;
206 }
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459

Member Function Documentation

◆ AdaptableWord()

bool tesseract::Classify::AdaptableWord ( WERD_RES * word)

Return true if the specified word is acceptable for adaptation.

Globals: none

Parameters
word: current word
Returns
true or false

Definition at line 821 of file adaptmatch.cpp.

821  {
822  if (word->best_choice == nullptr) return false;
823  int BestChoiceLength = word->best_choice->length();
824  float adaptable_score =
826  return // rules that apply in general - simplest to compute first
827  BestChoiceLength > 0 &&
828  BestChoiceLength == word->rebuild_word->NumBlobs() &&
829  BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
830  // This basically ensures that the word is at least a dictionary match
831  // (freq word, user word, system dawg word, etc).
832  // Since all the other adjustments will make adjust factor higher
833  // than higher than adaptable_score=1.1+0.05=1.15
834  // Since these are other flags that ensure that the word is dict word,
835  // this check could be at times redundant.
836  word->best_choice->adjust_factor() <= adaptable_score &&
837  // Make sure that alternative choices are not dictionary words.
838  word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
839 }
int NumBlobs() const
Definition: blobs.h:448
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:82
float adjust_factor() const
Definition: ratngs.h:296
int length() const
Definition: ratngs.h:293
TWERD * rebuild_word
Definition: pageres.h:266
double segment_penalty_dict_case_ok
Definition: dict.h:605
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
virtual Dict & getDict()
Definition: classify.h:107
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:80
WERD_CHOICE * best_choice
Definition: pageres.h:241

◆ AdaptiveClassifier()

void tesseract::Classify::AdaptiveClassifier ( TBLOB * Blob,
BLOB_CHOICE_LIST *  Choices 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Parameters
Blob: blob to be classified
[out] Choices: List of choices found by adaptive matcher. Filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 191 of file adaptmatch.cpp.

191  {
192  assert(Choices != nullptr);
193  auto *Results = new ADAPT_RESULTS;
194  Results->Initialize();
195 
196  ASSERT_HOST(AdaptedTemplates != nullptr);
197 
198  DoAdaptiveMatch(Blob, Results);
199 
200  RemoveBadMatches(Results);
201  Results->match.sort(&UnicharRating::SortDescendingRating);
202  RemoveExtraPuncs(Results);
203  Results->ComputeBest();
204  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
205  Choices);
206 
207  // TODO(rays) Move to before ConvertMatchesToChoices!
208  if (LargeSpeckle(*Blob) || Choices->length() == 0)
209  AddLargeSpeckleTo(Results->BlobLength, Choices);
210 
211  if (matcher_debug_level >= 1) {
212  tprintf("AD Matches = ");
213  PrintAdaptiveMatchResults(*Results);
214  }
215 
216 #ifndef GRAPHICS_DISABLED
218  DebugAdaptiveClassifier(Blob, Results);
219 #endif
220 
221  delete Results;
222 } /* AdaptiveClassifier */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:219
void Initialize()
Definition: adaptmatch.cpp:102
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
void RemoveBadMatches(ADAPT_RESULTS *Results)
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:242
const DENORM & denorm() const
Definition: blobs.h:363
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
bool classify_enable_adaptive_debugger
Definition: classify.h:450
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
TBOX bounding_box() const
Definition: blobs.cpp:468
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ AdaptiveClassifierIsEmpty()

bool tesseract::Classify::AdaptiveClassifierIsEmpty ( ) const
inline

Definition at line 326 of file classify.h.

326  {
327  return AdaptedTemplates->NumPermClasses == 0;
328  }
uint8_t NumPermClasses
Definition: adaptive.h:69
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515

◆ AdaptiveClassifierIsFull()

bool tesseract::Classify::AdaptiveClassifierIsFull ( ) const
inline

Definition at line 325 of file classify.h.

325 { return NumAdaptationsFailed > 0; }

◆ AdaptToChar()

void tesseract::Classify::AdaptToChar ( TBLOB * Blob,
CLASS_ID  ClassId,
int  FontinfoId,
float  Threshold,
ADAPT_TEMPLATES  adaptive_templates 
)
Parameters
Blob: blob to add to templates for ClassId
ClassId: class to add blob to
FontinfoId: font information from pre-trained templates
Threshold: minimum match rating to existing template
adaptive_templates: current set of adapted templates

Globals:

  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs

Definition at line 853 of file adaptmatch.cpp.

855  {
856  int NumFeatures;
857  INT_FEATURE_ARRAY IntFeatures;
858  UnicharRating int_result;
859  INT_CLASS IClass;
860  ADAPT_CLASS Class;
861  TEMP_CONFIG TempConfig;
862  FEATURE_SET FloatFeatures;
863  int NewTempConfigId;
864 
865  if (!LegalClassId (ClassId))
866  return;
867 
868  int_result.unichar_id = ClassId;
869  Class = adaptive_templates->Class[ClassId];
870  assert(Class != nullptr);
871  if (IsEmptyAdaptedClass(Class)) {
872  InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
873  } else {
874  IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
875 
876  NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
877  if (NumFeatures <= 0) {
878  return; // Features already freed by GetAdaptiveFeatures.
879  }
880 
881  // Only match configs with the matching font.
882  BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
883  for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
884  if (GetFontinfoId(Class, cfg) == FontinfoId) {
885  SET_BIT(MatchingFontConfigs, cfg);
886  } else {
887  reset_bit(MatchingFontConfigs, cfg);
888  }
889  }
890  im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
891  NumFeatures, IntFeatures,
894  FreeBitVector(MatchingFontConfigs);
895 
896  SetAdaptiveThreshold(Threshold);
897 
898  if (1.0f - int_result.rating <= Threshold) {
899  if (ConfigIsPermanent(Class, int_result.config)) {
901  tprintf("Found good match to perm config %d = %4.1f%%.\n",
902  int_result.config, int_result.rating * 100.0);
903  FreeFeatureSet(FloatFeatures);
904  return;
905  }
906 
907  TempConfig = TempConfigFor(Class, int_result.config);
908  IncreaseConfidence(TempConfig);
909  if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
910  Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
911  }
913  tprintf("Increasing reliability of temp config %d to %d.\n",
914  int_result.config, TempConfig->NumTimesSeen);
915 
916  if (TempConfigReliable(ClassId, TempConfig)) {
917  MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
918  UpdateAmbigsGroup(ClassId, Blob);
919  }
920  } else {
922  tprintf("Found poor match to temp config %d = %4.1f%%.\n",
923  int_result.config, int_result.rating * 100.0);
925  DisplayAdaptedChar(Blob, IClass);
926  }
927  NewTempConfigId =
928  MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
929  NumFeatures, IntFeatures, FloatFeatures);
930  if (NewTempConfigId >= 0 &&
931  TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
932  MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
933  UpdateAmbigsGroup(ClassId, Blob);
934  }
935 
936 #ifndef GRAPHICS_DISABLED
938  DisplayAdaptedChar(Blob, IClass);
939  }
940 #endif
941  }
942  FreeFeatureSet(FloatFeatures);
943  }
944 } /* AdaptToChar */
#define ClassForClassId(T, c)
Definition: intproto.h:178
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
int classify_adapt_feature_threshold
Definition: classify.h:483
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:79
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
#define reset_bit(array, bit)
Definition: bitvec.h:57
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:693
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:173
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:946
INT_TEMPLATES Templates
Definition: adaptive.h:67
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:152
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
IntegerMatcher im_
Definition: classify.h:540
void SetAdaptiveThreshold(float Threshold)
#define SET_BIT(array, bit)
Definition: bitvec.h:55
BIT_VECTOR AllProtosOn
Definition: classify.h:522
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
uint8_t NumTimesSeen
Definition: adaptive.h:36
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:82
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:95
uint8_t MaxNumTimesSeen
Definition: adaptive.h:57
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:786
int classify_learning_debug_level
Definition: classify.h:455
#define MAX_NUM_PROTOS
Definition: intproto.h:48
uint8_t NumConfigs
Definition: intproto.h:108
#define LegalClassId(c)
Definition: intproto.h:176
bool matcher_debug_separate_windows
Definition: classify.h:494
#define NO_DEBUG
Definition: adaptmatch.cpp:79
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
uint32_t * BIT_VECTOR
Definition: bitvec.h:28

◆ AddLargeSpeckleTo()

void tesseract::Classify::AddLargeSpeckleTo ( int  blob_length,
BLOB_CHOICE_LIST *  choices 
)

Definition at line 219 of file classify.cpp.

219  {
220  BLOB_CHOICE_IT bc_it(choices);
221  // If there is no classifier result, we will use the worst possible certainty
222  // and corresponding rating.
223  float certainty = -getDict().certainty_scale;
224  float rating = rating_scale * blob_length;
225  if (!choices->empty() && blob_length > 0) {
226  bc_it.move_to_last();
227  BLOB_CHOICE* worst_choice = bc_it.data();
228  // Add speckle_rating_penalty to worst rating, matching old value.
229  rating = worst_choice->rating() + speckle_rating_penalty;
230  // Compute the rating to correspond to the certainty. (Used to be kept
231  // the same, but that messes up the language model search.)
232  certainty = -rating * getDict().certainty_scale /
233  (rating_scale * blob_length);
234  }
235  auto* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
236  -1, 0.0f, FLT_MAX, 0,
237  BCC_SPECKLE_CLASSIFIER);
238  bc_it.add_to_end(blob_choice);
239 }
double certainty_scale
Definition: dict.h:627
double speckle_rating_penalty
Definition: classify.h:511
float rating() const
Definition: ratngs.h:80
virtual Dict & getDict()
Definition: classify.h:107

◆ AddNewResult()

void tesseract::Classify::AddNewResult ( const UnicharRating &  new_result,
ADAPT_RESULTS *  results 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

Parameters
new_result: new result to add
[out] results: results to add new result to

Definition at line 994 of file adaptmatch.cpp.

995  {
996  int old_match = FindScoredUnichar(new_result.unichar_id, *results);
997 
998  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
999  (old_match < results->match.size() &&
1000  new_result.rating <= results->match[old_match].rating))
1001  return; // New one not good enough.
1002 
1003  if (!unicharset.get_fragment(new_result.unichar_id))
1004  results->HasNonfragment = true;
1005 
1006  if (old_match < results->match.size()) {
1007  results->match[old_match].rating = new_result.rating;
1008  } else {
1009  results->match.push_back(new_result);
1010  }
1011 
1012  if (new_result.rating > results->best_rating &&
1013  // Ensure that fragments do not affect best rating, class and config.
1014  // This is needed so that at least one non-fragmented character is
1015  // always present in the results.
1016  // TODO(daria): verify that this helps accuracy and does not
1017  // hurt performance.
1018  !unicharset.get_fragment(new_result.unichar_id)) {
1019  results->best_match_index = old_match;
1020  results->best_rating = new_result.rating;
1021  results->best_unichar_id = new_result.unichar_id;
1022  }
1023 } /* AddNewResult */
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:94
int best_match_index
Definition: adaptmatch.cpp:95
float best_rating
Definition: adaptmatch.cpp:96
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:73
int push_back(T object)
bool HasNonfragment
Definition: adaptmatch.cpp:93

◆ AmbigClassifier()

void tesseract::Classify::AmbigClassifier ( const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT &  fx_info,
const TBLOB *  blob,
INT_TEMPLATES  templates,
ADAPT_CLASS *  classes,
UNICHAR_ID *  ambiguities,
ADAPT_RESULTS *  results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters
blob: blob to be classified
templates: built-in templates to classify against
classes: adapted class templates
ambiguities: array of unichar id's to match against
[out] results: place to put match results
int_features: (no description given)
fx_info: (no description given)

Definition at line 1045 of file adaptmatch.cpp.

1052  {
1053  if (int_features.empty()) return;
1054  auto* CharNormArray = new uint8_t[unicharset.size()];
1055  UnicharRating int_result;
1056 
1057  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
1058  CharNormArray);
1059  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1060  if (debug)
1061  tprintf("AM Matches = ");
1062 
1063  int top = blob->bounding_box().top();
1064  int bottom = blob->bounding_box().bottom();
1065  while (*ambiguities >= 0) {
1066  CLASS_ID class_id = *ambiguities;
1067 
1068  int_result.unichar_id = class_id;
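1069  im_.Match(ClassForClassId(templates, class_id),
1070  AllProtosOn, AllConfigsOn,
1071  int_features.size(), &int_features[0],
1072  &int_result,
1073  classify_adapt_feature_threshold, NO_DEBUG,
1074  matcher_debug_separate_windows);
1075 
1076  ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
1077  results->BlobLength,
1078  classify_integer_matcher_multiplier,
1079  CharNormArray, &int_result, results);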
1080  ambiguities++;
1081  }
1082  delete [] CharNormArray;
1083 } /* AmbigClassifier */
bool empty() const
Definition: genericvector.h:91
#define ClassForClassId(T, c)
Definition: intproto.h:178
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int size() const
Definition: unicharset.h:341
int classify_adapt_feature_threshold
Definition: classify.h:483
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
int32_t BlobLength
Definition: adaptmatch.cpp:92
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
IntegerMatcher im_
Definition: classify.h:540
BIT_VECTOR AllProtosOn
Definition: classify.h:522
int16_t bottom() const
Definition: rect.h:65
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
bool matcher_debug_separate_windows
Definition: classify.h:494
int classify_integer_matcher_multiplier
Definition: classify.h:505
UNICHARSET unicharset
Definition: ccutil.h:73
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: blobs.cpp:468
#define NO_DEBUG
Definition: adaptmatch.cpp:79
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
int16_t top() const
Definition: rect.h:58

◆ BaselineClassifier()

UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB *  Blob,
const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT &  fx_info,
ADAPT_TEMPLATES  Templates,
ADAPT_RESULTS *  Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs expected num features for each class
Parameters
Blob: blob to be classified
Templates: current set of adapted templates
Results: place to put match results
int_features: (no description given)
fx_info: (no description given)
Returns
Array of possible ambiguous chars that should be checked.

Definition at line 1265 of file adaptmatch.cpp.

1268  {
1269  if (int_features.empty()) return nullptr;
1270  auto* CharNormArray = new uint8_t[unicharset.size()];
1271  ClearCharNormArray(CharNormArray);
1272 
1273  Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
1274  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1275  CharNormArray, BaselineCutoffs, &Results->CPResults);
1276 
1277  if (matcher_debug_level >= 2 || classify_debug_level > 1)
1278  tprintf("BL Matches = ");
1279 
1280  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1281  CharNormArray,
1282  Templates->Class, matcher_debug_flags, 0,
1283  Blob->bounding_box(), Results->CPResults, Results);
1284 
1285  delete [] CharNormArray;
1286  CLASS_ID ClassId = Results->best_unichar_id;
1287  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1288  return nullptr;
1289 
1290  return Templates->Class[ClassId]->
1291  Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1292 } /* BaselineClassifier */
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
bool empty() const
Definition: genericvector.h:91
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
const double kStandardFeatureLength
Definition: intfx.h:46
int size() const
Definition: unicharset.h:341
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:94
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
int best_match_index
Definition: adaptmatch.cpp:95
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:98
INT_TEMPLATES Templates
Definition: adaptive.h:67
int32_t BlobLength
Definition: adaptmatch.cpp:92
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:452
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
int32_t Length
Definition: intfx.h:36
UNICHARSET unicharset
Definition: ccutil.h:73
CLUSTERCONFIG Config
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: blobs.cpp:468
int IntCastRounded(double x)
Definition: helpers.h:175

◆ CharNormClassifier()

int tesseract::Classify::CharNormClassifier ( TBLOB *  blob,
const TrainingSample &  sample,
ADAPT_RESULTS *  adapt_results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters
blob: blob to be classified
sample: templates to classify unknown against
adapt_results: place to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs

Definition at line 1311 of file adaptmatch.cpp.

1313  {
1314  // This is the length that is used for scaling ratings vs certainty.
1315  adapt_results->BlobLength =
1316  IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1317  GenericVector<UnicharRating> unichar_results;
1318  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1319  -1, &unichar_results);
1320  // Convert results to the format used internally by AdaptiveClassifier.
1321  for (int r = 0; r < unichar_results.size(); ++r) {
1322  AddNewResult(unichar_results[r], adapt_results);
1323  }
1324  return sample.num_features();
1325 } /* CharNormClassifier */
const double kStandardFeatureLength
Definition: intfx.h:46
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:994
Pix * pix() const
Definition: normalis.h:246
int32_t BlobLength
Definition: adaptmatch.cpp:92
const DENORM & denorm() const
Definition: blobs.h:363
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
Definition: cluster.h:32
int IntCastRounded(double x)
Definition: helpers.h:175

◆ CharNormTrainingSample()

int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
int  keep_this,
const TrainingSample sample,
GenericVector< UnicharRating > *  results 
)

Definition at line 1329 of file adaptmatch.cpp.

1332  {
1333  results->clear();
1334  auto* adapt_results = new ADAPT_RESULTS();
1335  adapt_results->Initialize();
1336  // Compute the bounding box of the features.
1337  uint32_t num_features = sample.num_features();
1338  // Only the top and bottom of the blob_box are used by MasterMatcher, so
1339  // fabricate right and left using top and bottom.
1340  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1341  sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1342  // Compute the char_norm_array from the saved cn_feature.
1343  FEATURE norm_feature = sample.GetCNFeature();
1344  auto* char_norm_array = new uint8_t[unicharset.size()];
1345  int num_pruner_classes = std::max(unicharset.size(),
1346  PreTrainedTemplates->NumClasses);
1347  auto* pruner_norm_array = new uint8_t[num_pruner_classes];
1348  adapt_results->BlobLength =
1349  static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1350  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1351  pruner_norm_array);
1352 
1353  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1354  pruner_norm_array,
1355  shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1356  &adapt_results->CPResults);
1357  delete [] pruner_norm_array;
1358  if (keep_this >= 0) {
1359  adapt_results->CPResults[0].Class = keep_this;
1360  adapt_results->CPResults.truncate(1);
1361  }
1362  if (pruner_only) {
1363  // Convert pruner results to output format.
1364  for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1365  int class_id = adapt_results->CPResults[i].Class;
1366  results->push_back(
1367  UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1368  }
1369  } else {
1370  MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1371  char_norm_array,
1372  nullptr, matcher_debug_flags,
1373  classify_integer_matcher_multiplier,
1374  blob_box, adapt_results->CPResults, adapt_results);
1375  // Convert master matcher results to output format.
1376  for (int i = 0; i < adapt_results->match.size(); i++) {
1377  results->push_back(adapt_results->match[i]);
1378  }
1379  results->sort(&UnicharRating::SortDescendingRating);
1380  }
1381  delete [] char_norm_array;
1382  delete adapt_results;
1383  return num_features;
1384 } /* CharNormTrainingSample */
ShapeTable * shape_table_
Definition: classify.h:546
int size() const
Definition: unicharset.h:341
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:452
Definition: rect.h:34
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55
int classify_integer_matcher_multiplier
Definition: classify.h:505
UNICHARSET unicharset
Definition: ccutil.h:73
int push_back(T object)
Definition: cluster.h:32

◆ ClassAndConfigIDToFontOrShapeID()

int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const

Definition at line 2207 of file adaptmatch.cpp.

2208  {
2209  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2210  // Older inttemps have no font_ids.
2211  if (font_set_id < 0)
2212  return kBlankFontinfoId;
2213  const FontSet &fs = fontset_table_.get(font_set_id);
2214  ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2215  return fs.configs[int_result_config];
2216 }
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ ClassIDToDebugStr()

STRING tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT *  templates,
int  class_id,
int  config_id 
) const

Definition at line 2194 of file adaptmatch.cpp.

2195  {
2196  STRING class_string;
2197  if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2198  int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2199  class_string = shape_table_->DebugStr(shape_id);
2200  } else {
2201  class_string = unicharset.debug_str(class_id);
2202  }
2203  return class_string;
2204 }
ShapeTable * shape_table_
Definition: classify.h:546
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
Definition: strngs.h:45
UNICHARSET unicharset
Definition: ccutil.h:73
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const

◆ ClassifyAsNoise()

void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS *  results)

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters
results: results to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob

Definition at line 1399 of file adaptmatch.cpp.

1399  {
1400  float rating = results->BlobLength / matcher_avg_noise_size;
1401  rating *= rating;
1402  rating /= 1.0 + rating;
1403 
1404  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1405 } /* ClassifyAsNoise */
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:994
double matcher_avg_noise_size
Definition: classify.h:461
int32_t BlobLength
Definition: adaptmatch.cpp:92

◆ ClearCharNormArray()

void tesseract::Classify::ClearCharNormArray ( uint8_t *  char_norm_array)

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters
char_norm_array: array to be cleared

Definition at line 44 of file float2int.cpp.

44  {
45  memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
46 } /* ClearCharNormArray */
int size() const
Definition: unicharset.h:341
UNICHARSET unicharset
Definition: ccutil.h:73

◆ ComputeCharNormArrays()

void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT *  norm_feature,
INT_TEMPLATES_STRUCT *  templates,
uint8_t *  char_norm_array,
uint8_t *  pruner_array 
)

Definition at line 1698 of file adaptmatch.cpp.

1701  {
1702  ComputeIntCharNormArray(*norm_feature, char_norm_array);
1703  if (pruner_array != nullptr) {
1704  if (shape_table_ == nullptr) {
1705  ComputeIntCharNormArray(*norm_feature, pruner_array);
1706  } else {
1707  memset(pruner_array, UINT8_MAX,
1708  templates->NumClasses * sizeof(pruner_array[0]));
1709  // Each entry in the pruner norm array is the MIN of all the entries of
1710  // the corresponding unichars in the CharNormArray.
1711  for (int id = 0; id < templates->NumClasses; ++id) {
1712  int font_set_id = templates->Class[id]->font_set_id;
1713  const FontSet &fs = fontset_table_.get(font_set_id);
1714  for (int config = 0; config < fs.size; ++config) {
1715  const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1716  for (int c = 0; c < shape.size(); ++c) {
1717  if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1718  pruner_array[id] = char_norm_array[shape[c].unichar_id];
1719  }
1720  }
1721  }
1722  }
1723  }
1724  FreeFeature(norm_feature);
1725 }
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
ShapeTable * shape_table_
Definition: classify.h:546
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:54

◆ ComputeCorrectedRating()

double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors 
)

Definition at line 1202 of file adaptmatch.cpp.

1207  {
1208  // Compute class feature corrections.
1209  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1210  cn_factors[unichar_id],
1211  matcher_multiplier);
1212  double miss_penalty = tessedit_class_miss_scale * feature_misses;
1213  double vertical_penalty = 0.0;
1214  // Penalize non-alnums for being vertical misfits.
1215  if (!unicharset.get_isalpha(unichar_id) &&
1216  !unicharset.get_isdigit(unichar_id) &&
1217  cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1218  int min_bottom, max_bottom, min_top, max_top;
1219  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1220  &min_top, &max_top);
1221  if (debug) {
1222  tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1223  top, min_top, max_top, bottom, min_bottom, max_bottom);
1224  }
1225  if (top < min_top || top > max_top ||
1226  bottom < min_bottom || bottom > max_bottom) {
1227  vertical_penalty = classify_misfit_junk_penalty;
1228  }
1229  }
1230  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1231  if (result < WORST_POSSIBLE_RATING)
1232  result = WORST_POSSIBLE_RATING;
1233  if (debug) {
1234  tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1235  unicharset.id_to_unichar(unichar_id),
1236  result * 100.0,
1237  cp_rating * 100.0,
1238  (1.0 - im_rating) * 100.0,
1239  (cn_corrected - (1.0 - im_rating)) * 100.0,
1240  cn_factors[unichar_id],
1241  miss_penalty * 100.0,
1242  vertical_penalty * 100.0);
1243  }
1244  return result;
1245 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:86
double classify_misfit_junk_penalty
Definition: classify.h:471
IntegerMatcher im_
Definition: classify.h:540
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
double tessedit_class_miss_scale
Definition: classify.h:475
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
UNICHARSET unicharset
Definition: ccutil.h:73

◆ ComputeIntCharNormArray()

void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT &  norm_feature,
uint8_t *  char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

  • PreTrainedTemplates current set of built-in templates
Parameters
norm_feature: character normalization feature
[out] char_norm_array: place to put results of size unicharset.size()

Definition at line 62 of file float2int.cpp.

63  {
64  for (int i = 0; i < unicharset.size(); i++) {
65  if (i < PreTrainedTemplates->NumClasses) {
66  int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
67  ComputeNormMatch(i, norm_feature, false));
68  char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
69  } else {
70  // Classes with no templates (eg. ambigs & ligatures) default
71  // to worst match.
72  char_norm_array[i] = MAX_INT_CHAR_NORM;
73  }
74  }
75 } /* ComputeIntCharNormArray */
int size() const
Definition: unicharset.h:341
#define INT_CHAR_NORM_RANGE
Definition: intproto.h:130
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:108
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94
UNICHARSET unicharset
Definition: ccutil.h:73
#define MAX_INT_CHAR_NORM
Definition: float2int.cpp:27

◆ ComputeIntFeatures()

void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters
Features: floating point pico-features to be converted
[out] IntFeatures: array to put converted features into

Definition at line 90 of file float2int.cpp.

91  {
92  float YShift;
93 
94  if (classify_norm_method == baseline)
95  YShift = BASELINE_Y_SHIFT;
96  else
97  YShift = Y_SHIFT;
98 
99  for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {
100  FEATURE Feature = Features->Features[Fid];
101 
102  IntFeatures[Fid].X =
103  Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);
104  IntFeatures[Fid].Y =
105  Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
106  IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
107  ANGLE_SHIFT, INT_FEAT_RANGE);
108  IntFeatures[Fid].CP_misses = 0;
109  }
110 } /* ComputeIntFeatures */
#define Y_SHIFT
Definition: intproto.h:42
uint8_t Bucket8For(float param, float offset, int num_buckets)
Definition: intproto.cpp:418
#define X_SHIFT
Definition: intproto.h:41
FEATURE Features[1]
Definition: ocrfeatures.h:68
#define ANGLE_SHIFT
Definition: intproto.h:40
#define BASELINE_Y_SHIFT
Definition: float2int.h:28
float Params[1]
Definition: ocrfeatures.h:61
uint16_t NumFeatures
Definition: ocrfeatures.h:66
#define INT_FEAT_RANGE
Definition: float2int.h:27
uint8_t CircBucketFor(float param, float offset, int num_buckets)
Definition: intproto.cpp:432

◆ ComputeNormMatch()

float tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT &  feature,
bool  DebugMatch 
)

This routine compares Features against each character normalization proto for ClassId and returns the match rating of the best match.

Parameters
ClassId: id of class to match against
feature: character normalization feature
DebugMatch: controls dump of debug info

Globals: NormProtos character normalization prototypes

Returns
Best match rating for Feature against protos of ClassId.

Definition at line 94 of file normmatch.cpp.

96  {
97  LIST Protos;
98  float BestMatch;
99  float Match;
100  float Delta;
101  PROTOTYPE *Proto;
102  int ProtoId;
103 
104  if (ClassId >= NormProtos->NumProtos) {
105  ClassId = NO_CLASS;
106  }
107 
108  /* handle requests for classification as noise */
109  if (ClassId == NO_CLASS) {
110  /* kludge - clean up constants and make into control knobs later */
111  Match = (feature.Params[CharNormLength] *
112  feature.Params[CharNormLength] * 500.0 +
113  feature.Params[CharNormRx] *
114  feature.Params[CharNormRx] * 8000.0 +
115  feature.Params[CharNormRy] *
116  feature.Params[CharNormRy] * 8000.0);
117  return (1.0 - NormEvidenceOf(Match));
118  }
119 
120  BestMatch = FLT_MAX;
121  Protos = NormProtos->Protos[ClassId];
122 
123  if (DebugMatch) {
124  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
125  }
126 
127  ProtoId = 0;
128  iterate(Protos) {
129  Proto = reinterpret_cast<PROTOTYPE *>first_node (Protos);
130  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
131  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
132  if (DebugMatch) {
133  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
134  Proto->Mean[CharNormY], Delta,
135  Proto->Weight.Elliptical[CharNormY], Match);
136  }
137  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
138  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
139  if (DebugMatch) {
140  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
141  Proto->Mean[CharNormRx], Delta,
142  Proto->Weight.Elliptical[CharNormRx], Match);
143  }
144  // Ry is width! See intfx.cpp.
145  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
146  if (DebugMatch) {
147  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
148  Proto->Mean[CharNormRy], Delta,
149  Proto->Weight.Elliptical[CharNormRy]);
150  }
151  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
152  Delta *= kWidthErrorWeighting;
153  Match += Delta;
154  if (DebugMatch) {
155  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
156  Match, Match / classify_norm_adj_midpoint,
157  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
158  }
159 
160  if (Match < BestMatch)
161  BestMatch = Match;
162 
163  ProtoId++;
164  }
165  return 1.0 - NormEvidenceOf(BestMatch);
166 } /* ComputeNormMatch */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define iterate(l)
Definition: oldlist.h:101
float * Mean
Definition: cluster.h:74
FLOATUNION Weight
Definition: cluster.h:79
double classify_norm_adj_midpoint
Definition: normmatch.cpp:71
#define NO_CLASS
Definition: matchdefs.h:35
float Params[1]
Definition: ocrfeatures.h:61
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
float * Elliptical
Definition: cluster.h:60
UNICHARSET unicharset
Definition: ccutil.h:73
#define first_node(l)
Definition: oldlist.h:92
NORM_PROTOS * NormProtos
Definition: classify.h:527
const double kWidthErrorWeighting
Definition: normmatch.cpp:74
LIST * Protos
Definition: normmatch.cpp:38

◆ ConvertMatchesToChoices()

void tesseract::Classify::ConvertMatchesToChoices ( const DENORM &  denorm,
const TBOX &  box,
ADAPT_RESULTS *  Results,
BLOB_CHOICE_LIST *  Choices 
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1413 of file adaptmatch.cpp.

1415  {
1416  assert(Choices != nullptr);
1417  float Rating;
1418  float Certainty;
1419  BLOB_CHOICE_IT temp_it;
1420  bool contains_nonfrag = false;
1421  temp_it.set_to_list(Choices);
1422  int choices_length = 0;
1423  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1424  // number of returned results, but with a shape_table_ we want to have room
1425  // for at least the biggest shape (which might contain hundreds of Indic
1426  // grapheme fragments) and more, so use double the size of the biggest shape
1427  // if that is more than the default.
1428  int max_matches = MAX_MATCHES;
1429  if (shape_table_ != nullptr) {
1430  max_matches = shape_table_->MaxNumUnichars() * 2;
1431  if (max_matches < MAX_MATCHES)
1432  max_matches = MAX_MATCHES;
1433  }
1434 
1435  float best_certainty = -FLT_MAX;
1436  for (int i = 0; i < Results->match.size(); i++) {
1437  const UnicharRating& result = Results->match[i];
1438  bool adapted = result.adapted;
1439  bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1440  if (temp_it.length()+1 == max_matches &&
1441  !contains_nonfrag && current_is_frag) {
1442  continue; // look for a non-fragmented character to fill the
1443  // last spot in Choices if only fragments are present
1444  }
1445  // BlobLength can never be legally 0, this means recognition failed.
1446  // But we must return a classification result because some invoking
1447  // functions (chopper/permuter) do not anticipate a null blob choice.
1448  // So we need to assign a poor, but not infinitely bad score.
1449  if (Results->BlobLength == 0) {
1450  Certainty = -20;
1451  Rating = 100; // should be -certainty * real_blob_length
1452  } else {
1453  Rating = Certainty = (1.0f - result.rating);
1454  Rating *= rating_scale * Results->BlobLength;
1455  Certainty *= -(getDict().certainty_scale);
1456  }
1457  // Adapted results, by their very nature, should have good certainty.
1458  // Those that don't are at best misleading, and often lead to errors,
1459  // so don't accept adapted results that are too far behind the best result,
1460  // whether adapted or static.
1461  // TODO(rays) find some way of automatically tuning these constants.
1462  if (Certainty > best_certainty) {
1463  best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1464  } else if (adapted &&
1465  Certainty / classify_adapted_pruning_factor < best_certainty) {
1466  continue; // Don't accept bad adapted results.
1467  }
1468 
1469  float min_xheight, max_xheight, yshift;
1470  denorm.XHeightRange(result.unichar_id, unicharset, box,
1471  &min_xheight, &max_xheight, &yshift);
1472  auto* choice =
1473  new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1474  unicharset.get_script(result.unichar_id),
1475  min_xheight, max_xheight, yshift,
1476  adapted ? BCC_ADAPTED_CLASSIFIER
1477  : BCC_STATIC_CLASSIFIER);
1478  choice->set_fonts(result.fonts);
1479  temp_it.add_to_end(choice);
1480  contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1481  choices_length++;
1482  if (choices_length >= max_matches) break;
1483  }
1484  Results->match.truncate(choices_length);
1485 } // ConvertMatchesToChoices
int MaxNumUnichars() const
Definition: shapetable.cpp:455
double certainty_scale
Definition: dict.h:627
GenericVector< ScoredFont > fonts
Definition: shapetable.h:87
ShapeTable * shape_table_
Definition: classify.h:546
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
int32_t BlobLength
Definition: adaptmatch.cpp:92
void truncate(int size)
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:96
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
virtual Dict & getDict()
Definition: classify.h:107
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:73
int size() const
Definition: genericvector.h:72
double classify_adapted_pruning_threshold
Definition: classify.h:479
double classify_adapted_pruning_factor
Definition: classify.h:477
#define MAX_MATCHES
Definition: adaptmatch.cpp:77
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:428

◆ ConvertProto()

void tesseract::Classify::ConvertProto ( PROTO  Proto,
int  ProtoId,
INT_CLASS  Class 
)

This routine converts Proto to integer format and installs it as ProtoId in Class.

Parameters
Proto: floating-pt proto to be converted to integer format
ProtoId: id of proto
Class: integer class to add converted proto to

Definition at line 488 of file intproto.cpp.

488  {
489  INT_PROTO P;
490  float Param;
491 
492  assert(ProtoId < Class->NumProtos);
493 
494  P = ProtoForProtoId(Class, ProtoId);
495 
496  Param = Proto->A * 128;
497  P->A = TruncateParam(Param, -128, 127, nullptr);
498 
499  Param = -Proto->B * 256;
500  P->B = TruncateParam(Param, 0, 255, nullptr);
501 
502  Param = Proto->C * 128;
503  P->C = TruncateParam(Param, -128, 127, nullptr);
504 
505  Param = Proto->Angle * 256;
506  if (Param < 0 || Param >= 256)
507  P->Angle = 0;
508  else
509  P->Angle = static_cast<uint8_t>(Param);
510 
511  /* round proto length to nearest integer number of pico-features */
512  Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
513  Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255, nullptr);
515  cprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
516  P->A, P->B, P->C, Class->ProtoLengths[ProtoId]);
517 } /* ConvertProto */
float A
Definition: protos.h:37
int TruncateParam(float Param, int Min, int Max, char *Id)
Definition: intproto.cpp:1701
uint8_t * ProtoLengths
Definition: intproto.h:110
uint8_t Angle
Definition: intproto.h:85
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
#define GetPicoFeatureLength()
Definition: picofeat.h:57
float B
Definition: protos.h:38
float Length
Definition: protos.h:43
int classify_learning_debug_level
Definition: classify.h:455
float C
Definition: protos.h:39
float Angle
Definition: protos.h:42
#define ProtoForProtoId(C, P)
Definition: intproto.h:168

◆ CreateIntTemplates()

INT_TEMPLATES tesseract::Classify::CreateIntTemplates ( CLASSES  FloatProtos,
const UNICHARSET & target_unicharset
)

This routine converts from the old floating point format to the new integer format.

Parameters
FloatProtos: prototypes in old floating pt format
target_unicharset: the UNICHARSET to use
Returns
New set of training templates in integer format.
Note
Globals: none

Definition at line 527 of file intproto.cpp.

529  {
530  INT_TEMPLATES IntTemplates;
531  CLASS_TYPE FClass;
532  INT_CLASS IClass;
533  int ClassId;
534  int ProtoId;
535  int ConfigId;
536 
537  IntTemplates = NewIntTemplates();
538 
539  for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
540  FClass = &(FloatProtos[ClassId]);
541  if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
542  strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
543  cprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
544  target_unicharset.id_to_unichar(ClassId));
545  }
546  assert(UnusedClassIdIn(IntTemplates, ClassId));
547  IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
548  FontSet fs;
549  fs.size = FClass->font_set.size();
550  fs.configs = new int[fs.size];
551  for (int i = 0; i < fs.size; ++i) {
552  fs.configs[i] = FClass->font_set.get(i);
553  }
554  if (this->fontset_table_.contains(fs)) {
555  IClass->font_set_id = this->fontset_table_.get_id(fs);
556  delete[] fs.configs;
557  } else {
558  IClass->font_set_id = this->fontset_table_.push_back(fs);
559  }
560  AddIntClass(IntTemplates, ClassId, IClass);
561 
562  for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
563  AddIntProto(IClass);
564  ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
565  AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
567  AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
568  }
569 
570  for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
571  AddIntConfig(IClass);
572  ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
573  }
574  }
575  return (IntTemplates);
576 } /* CreateIntTemplates */
int size() const
Return the size used.
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:282
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:261
void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class)
Definition: intproto.cpp:231
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:367
int size() const
Definition: unicharset.h:341
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:463
UnicityTableEqEq< int > font_set
Definition: protos.h:61
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:682
int16_t NumConfigs
Definition: protos.h:58
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:488
const T & get(int id) const
Return the object from an id.
#define ProtoIn(Class, Pid)
Definition: protos.h:84
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs)
Definition: intproto.cpp:626
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:328
int16_t NumProtos
Definition: protos.h:55
#define UnusedClassIdIn(T, c)
Definition: intproto.h:177
int classify_learning_debug_level
Definition: classify.h:455
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
CONFIGS Configurations
Definition: protos.h:60

◆ DebugAdaptiveClassifier()

void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB * blob,
ADAPT_RESULTS * Results
)
Parameters
blob: blob whose classification is being debugged
Results: results of match being debugged

Globals: none

Definition at line 1497 of file adaptmatch.cpp.

1498  {
1499  if (static_classifier_ == nullptr) return;
1500  INT_FX_RESULT_STRUCT fx_info;
1502  TrainingSample* sample =
1503  BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1504  if (sample == nullptr) return;
1505  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1506  Results->best_unichar_id);
1507 } /* DebugAdaptiveClassifier */
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:94
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
Pix * pix() const
Definition: normalis.h:246
const DENORM & denorm() const
Definition: blobs.h:363
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
Definition: cluster.h:32

◆ DisplayAdaptedChar()

void tesseract::Classify::DisplayAdaptedChar ( TBLOB * blob,
INT_CLASS_STRUCT * int_class
)

Definition at line 946 of file adaptmatch.cpp.

946  {
947 #ifndef GRAPHICS_DISABLED
948  INT_FX_RESULT_STRUCT fx_info;
950  TrainingSample* sample =
952  &bl_features);
953  if (sample == nullptr) return;
954 
955  UnicharRating int_result;
956  im_.Match(int_class, AllProtosOn, AllConfigsOn,
957  bl_features.size(), &bl_features[0],
960  tprintf("Best match to temp config %d = %4.1f%%.\n",
961  int_result.config, int_result.rating * 100.0);
963  uint32_t ConfigMask;
964  ConfigMask = 1 << int_result.config;
966  im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask),
967  bl_features.size(), &bl_features[0],
971  }
972 
973  delete sample;
974 #endif
975 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int classify_adapt_feature_threshold
Definition: classify.h:483
bool classify_nonlinear_norm
Definition: classify.h:452
IntegerMatcher im_
Definition: classify.h:540
void UpdateMatchDisplay()
Definition: intproto.cpp:447
BIT_VECTOR AllProtosOn
Definition: classify.h:522
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
int classify_learning_debug_level
Definition: classify.h:455
bool matcher_debug_separate_windows
Definition: classify.h:494
int size() const
Definition: genericvector.h:72
#define NO_DEBUG
Definition: adaptmatch.cpp:79
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
Definition: cluster.h:32

◆ DoAdaptiveMatch()

void tesseract::Classify::DoAdaptiveMatch ( TBLOB * Blob,
ADAPT_RESULTS * Results
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters
Blob: blob to be classified
Results: place to put match results

Globals:

  • PreTrainedTemplates built-in training templates
  • AdaptedTemplates templates adapted for this page
  • matcher_reliable_adaptive_result rating limit for a great match

Definition at line 1530 of file adaptmatch.cpp.

1530  {
1531  UNICHAR_ID *Ambiguities;
1532 
1533  INT_FX_RESULT_STRUCT fx_info;
1535  TrainingSample* sample =
1537  &bl_features);
1538  if (sample == nullptr) return;
1539 
1540  // TODO: With LSTM, static_classifier_ is nullptr.
1541  // Return to avoid crash in CharNormClassifier.
1542  if (static_classifier_ == nullptr) {
1543  delete sample;
1544  return;
1545  }
1546 
1548  tess_cn_matching) {
1549  CharNormClassifier(Blob, *sample, Results);
1550  } else {
1551  Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1552  AdaptedTemplates, Results);
1553  if ((!Results->match.empty() &&
1554  MarginalMatch(Results->best_rating,
1556  !tess_bn_matching) ||
1557  Results->match.empty()) {
1558  CharNormClassifier(Blob, *sample, Results);
1559  } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1560  AmbigClassifier(bl_features, fx_info, Blob,
1563  Ambiguities,
1564  Results);
1565  }
1566  }
1567 
1568  // Force the blob to be classified as noise
1569  // if the results contain only fragments.
1570  // TODO(daria): verify that this is better than
1571  // just adding a nullptr classification.
1572  if (!Results->HasNonfragment || Results->match.empty())
1573  ClassifyAsNoise(Results);
1574  delete sample;
1575 } /* DoAdaptiveMatch */
int UNICHAR_ID
Definition: unichar.h:34
bool empty() const
Definition: genericvector.h:91
bool classify_nonlinear_norm
Definition: classify.h:452
uint8_t NumPermClasses
Definition: adaptive.h:69
double matcher_reliable_adaptive_result
Definition: classify.h:457
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:131
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
float best_rating
Definition: adaptmatch.cpp:96
void ClassifyAsNoise(ADAPT_RESULTS *Results)
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int matcher_permanent_classes_min
Definition: classify.h:462
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
Definition: cluster.h:32
bool HasNonfragment
Definition: adaptmatch.cpp:93

◆ EndAdaptiveClassifier()

void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

Definition at line 459 of file adaptmatch.cpp.

459  {
460  STRING Filename;
461  FILE *File;
462 
463  if (AdaptedTemplates != nullptr &&
465  Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
466  File = fopen (Filename.string(), "wb");
467  if (File == nullptr)
468  cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
469  else {
470  cprintf ("\nSaving adapted templates to %s ...", Filename.string());
471  fflush(stdout);
473  cprintf ("\n");
474  fclose(File);
475  }
476  }
477 
478  if (AdaptedTemplates != nullptr) {
480  AdaptedTemplates = nullptr;
481  }
482  if (BackupAdaptedTemplates != nullptr) {
484  BackupAdaptedTemplates = nullptr;
485  }
486 
487  if (PreTrainedTemplates != nullptr) {
489  PreTrainedTemplates = nullptr;
490  }
492  FreeNormProtos();
493  if (AllProtosOn != nullptr) {
494  FreeBitVector(AllProtosOn);
495  FreeBitVector(AllConfigsOn);
496  FreeBitVector(AllConfigsOff);
497  FreeBitVector(TempProtoMask);
498  AllProtosOn = nullptr;
499  AllConfigsOn = nullptr;
500  AllConfigsOff = nullptr;
501  TempProtoMask = nullptr;
502  }
503  delete shape_table_;
504  shape_table_ = nullptr;
505  delete static_classifier_;
506  static_classifier_ = nullptr;
507 } /* EndAdaptiveClassifier */
bool classify_enable_adaptive_matcher
Definition: classify.h:445
ShapeTable * shape_table_
Definition: classify.h:546
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
BIT_VECTOR TempProtoMask
Definition: classify.h:525
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:698
const char * string() const
Definition: strngs.cpp:194
bool classify_save_adapted_templates
Definition: classify.h:449
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
BIT_VECTOR AllProtosOn
Definition: classify.h:522
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:453
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
virtual Dict & getDict()
Definition: classify.h:107
Definition: strngs.h:45
void EndDangerousAmbigs()
Definition: stopper.cpp:360
STRING imagefile
Definition: ccutil.h:77
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:75

◆ ExpandShapesAndApplyCorrections()

void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS * classes,
bool  debug,
int  class_id,
int  bottom,
int  top,
float  cp_rating,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors,
UnicharRating * int_result,
ADAPT_RESULTS * final_results
)

Definition at line 1128 of file adaptmatch.cpp.

1132  {
1133  if (classes != nullptr) {
1134  // Adapted result. Convert configs to fontinfo_ids.
1135  int_result->adapted = true;
1136  for (int f = 0; f < int_result->fonts.size(); ++f) {
1137  int_result->fonts[f].fontinfo_id =
1138  GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1139  }
1140  } else {
1141  // Pre-trained result. Map fonts using font_sets_.
1142  int_result->adapted = false;
1143  for (int f = 0; f < int_result->fonts.size(); ++f) {
1144  int_result->fonts[f].fontinfo_id =
1146  int_result->fonts[f].fontinfo_id);
1147  }
1148  if (shape_table_ != nullptr) {
1149  // Two possible cases:
1150  // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1151  // int_result->fonts are the same. In this case build a new vector of
1152  // mapped fonts and replace the fonts in int_result.
1153  // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1154  // by int_result. In this case, build a vector of UnicharRating to
1155  // gather together different font-ids for each unichar. Also covers case1.
1156  GenericVector<UnicharRating> mapped_results;
1157  for (int f = 0; f < int_result->fonts.size(); ++f) {
1158  int shape_id = int_result->fonts[f].fontinfo_id;
1159  const Shape& shape = shape_table_->GetShape(shape_id);
1160  for (int c = 0; c < shape.size(); ++c) {
1161  int unichar_id = shape[c].unichar_id;
1162  if (!unicharset.get_enabled(unichar_id)) continue;
1163  // Find the mapped_result for unichar_id.
1164  int r = 0;
1165  for (r = 0; r < mapped_results.size() &&
1166  mapped_results[r].unichar_id != unichar_id; ++r) {}
1167  if (r == mapped_results.size()) {
1168  mapped_results.push_back(*int_result);
1169  mapped_results[r].unichar_id = unichar_id;
1170  mapped_results[r].fonts.truncate(0);
1171  }
1172  for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1173  mapped_results[r].fonts.push_back(
1174  ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1175  }
1176  }
1177  }
1178  for (int m = 0; m < mapped_results.size(); ++m) {
1179  mapped_results[m].rating =
1180  ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1181  cp_rating, int_result->rating,
1182  int_result->feature_misses, bottom, top,
1183  blob_length, matcher_multiplier, cn_factors);
1184  AddNewResult(mapped_results[m], final_results);
1185  }
1186  return;
1187  }
1188  }
1189  if (unicharset.get_enabled(class_id)) {
1190  int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1191  int_result->rating,
1192  int_result->feature_misses,
1193  bottom, top, blob_length,
1194  matcher_multiplier, cn_factors);
1195  AddNewResult(*int_result, final_results);
1196  }
1197 }
GenericVector< ScoredFont > fonts
Definition: shapetable.h:87
ShapeTable * shape_table_
Definition: classify.h:546
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:994
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:173
void truncate(int size)
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
UNICHARSET unicharset
Definition: ccutil.h:73
int push_back(T object)
int size() const
Definition: genericvector.h:72
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const

◆ ExtractFeatures()

void tesseract::Classify::ExtractFeatures ( const TBLOB & blob,
bool  nonlinear_norm,
GenericVector< INT_FEATURE_STRUCT > *  bl_features,
GenericVector< INT_FEATURE_STRUCT > *  cn_features,
INT_FX_RESULT_STRUCT * results,
GenericVector< int > *  outline_cn_counts 
)
static

Definition at line 442 of file intfx.cpp.

447  {
448  DENORM bl_denorm, cn_denorm;
449  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
450  &bl_denorm, &cn_denorm, results);
451  if (outline_cn_counts != nullptr)
452  outline_cn_counts->truncate(0);
453  // Iterate the outlines.
454  for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) {
455  // Iterate the polygon.
456  EDGEPT* loop_pt = ol->FindBestStartPt();
457  EDGEPT* pt = loop_pt;
458  if (pt == nullptr) continue;
459  do {
460  if (pt->IsHidden()) continue;
461  // Find a run of equal src_outline.
462  EDGEPT* last_pt = pt;
463  do {
464  last_pt = last_pt->next;
465  } while (last_pt != loop_pt && !last_pt->IsHidden() &&
466  last_pt->src_outline == pt->src_outline);
467  last_pt = last_pt->prev;
468  // Until the adaptive classifier can be weaned off polygon segments,
469  // we have to force extraction from the polygon for the bl_features.
470  ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
471  true, bl_features);
472  ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
473  false, cn_features);
474  pt = last_pt;
475  } while ((pt = pt->next) != loop_pt);
476  if (outline_cn_counts != nullptr)
477  outline_cn_counts->push_back(cn_features->size());
478  }
479  results->NumBL = bl_features->size();
480  results->NumCN = cn_features->size();
481  results->YBottom = blob.bounding_box().bottom();
482  results->YTop = blob.bounding_box().top();
483  results->Width = blob.bounding_box().width();
484 }
int16_t width() const
Definition: rect.h:115
const double kStandardFeatureLength
Definition: intfx.h:46
Definition: blobs.h:99
uint8_t YTop
Definition: intfx.h:42
EDGEPT * next
Definition: blobs.h:192
TESSLINE * outlines
Definition: blobs.h:400
bool IsHidden() const
Definition: blobs.h:176
int16_t NumCN
Definition: intfx.h:39
int16_t NumBL
Definition: intfx.h:39
void truncate(int size)
TESSLINE * next
Definition: blobs.h:281
uint8_t YBottom
Definition: intfx.h:41
int16_t bottom() const
Definition: rect.h:65
int push_back(T object)
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: blobs.cpp:468
C_OUTLINE * src_outline
Definition: blobs.h:194
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129
EDGEPT * prev
Definition: blobs.h:193
int16_t Width
Definition: intfx.h:40
int16_t top() const
Definition: rect.h:58

◆ ExtractIntCNFeatures()

FEATURE_SET tesseract::Classify::ExtractIntCNFeatures ( const TBLOB & blob,
const INT_FX_RESULT_STRUCT & fx_info
)
Parameters
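blob: blob to extract features from
fx_info: feature-extraction info; it is copied into a local working copy that is passed on to the extractor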
Returns
Integer character-normalized features for blob.

Definition at line 217 of file picofeat.cpp.

218  {
219  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
222  blob, false, &local_fx_info, &bl_features);
223  if (sample == nullptr) return nullptr;
224 
225  uint32_t num_features = sample->num_features();
226  const INT_FEATURE_STRUCT* features = sample->features();
227  FEATURE_SET feature_set = NewFeatureSet(num_features);
228  for (uint32_t f = 0; f < num_features; ++f) {
229  FEATURE feature = NewFeature(&IntFeatDesc);
230 
231  feature->Params[IntX] = features[f].X;
232  feature->Params[IntY] = features[f].Y;
233  feature->Params[IntDir] = features[f].Theta;
234  AddFeature(feature_set, feature);
235  }
236  delete sample;
237 
238  return feature_set;
239 } /* ExtractIntCNFeatures */
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:78
Definition: picofeat.h:31
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:94
const FEATURE_DESC_STRUCT IntFeatDesc
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:40
Definition: picofeat.h:30
float Params[1]
Definition: ocrfeatures.h:61
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
Definition: cluster.h:32

◆ ExtractIntGeoFeatures()

FEATURE_SET tesseract::Classify::ExtractIntGeoFeatures ( const TBLOB & blob,
const INT_FX_RESULT_STRUCT & fx_info
)
Parameters
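blob: blob to extract features from
fx_info: feature-extraction info; it is copied into a local working copy that is passed on to the extractor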
Returns
Geometric (top/bottom/width) features for blob.

Definition at line 247 of file picofeat.cpp.

248  {
249  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
252  blob, false, &local_fx_info, &bl_features);
253  if (sample == nullptr) return nullptr;
254 
255  FEATURE_SET feature_set = NewFeatureSet(1);
256  FEATURE feature = NewFeature(&IntFeatDesc);
257 
258  feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
259  feature->Params[GeoTop] = sample->geo_feature(GeoTop);
260  feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
261  AddFeature(feature_set, feature);
262  delete sample;
263 
264  return feature_set;
265 } /* ExtractIntGeoFeatures */
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:78
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:94
const FEATURE_DESC_STRUCT IntFeatDesc
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:40
float Params[1]
Definition: ocrfeatures.h:61
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
Definition: cluster.h:32

◆ ExtractOutlineFeatures()

FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB * Blob)

Convert each segment in the outline to a feature and return the features.

Parameters
Blob: blob to extract outline-features from (if nullptr, an empty feature set is returned)
Returns
Outline-features for Blob.
Note
Globals: none

Definition at line 41 of file outfeat.cpp.

41  {
42  LIST Outlines;
43  LIST RemainingOutlines;
44  MFOUTLINE Outline;
45  FEATURE_SET FeatureSet;
46  float XScale, YScale;
47 
48  FeatureSet = NewFeatureSet (MAX_OUTLINE_FEATURES);
49  if (Blob == nullptr)
50  return (FeatureSet);
51 
52  Outlines = ConvertBlob (Blob);
53 
54  NormalizeOutlines(Outlines, &XScale, &YScale);
55  RemainingOutlines = Outlines;
56  iterate(RemainingOutlines) {
57  Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
58  ConvertToOutlineFeatures(Outline, FeatureSet);
59  }
61  NormalizeOutlineX(FeatureSet);
62  FreeOutlines(Outlines);
63  return (FeatureSet);
64 } /* ExtractOutlineFeatures */
#define iterate(l)
Definition: oldlist.h:101
void NormalizeOutlineX(FEATURE_SET FeatureSet)
Definition: outfeat.cpp:145
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:94
#define MAX_OUTLINE_FEATURES
Definition: outfeat.h:35
void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: outfeat.cpp:107
#define first_node(l)
Definition: oldlist.h:92
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:167
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:37
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:276

◆ ExtractPicoFeatures()

FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB * Blob)

Operation: Dummy for now.

Globals:

  • classify_norm_method normalization method currently specified

Parameters
Blob: blob to extract pico-features from
Returns
Pico-features for Blob.

Definition at line 63 of file picofeat.cpp.

63  {
64  LIST Outlines;
65  LIST RemainingOutlines;
66  MFOUTLINE Outline;
67  FEATURE_SET FeatureSet;
68  float XScale, YScale;
69 
70  FeatureSet = NewFeatureSet(MAX_PICO_FEATURES);
71  Outlines = ConvertBlob(Blob);
72  NormalizeOutlines(Outlines, &XScale, &YScale);
73  RemainingOutlines = Outlines;
74  iterate(RemainingOutlines) {
75  Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
76  ConvertToPicoFeatures2(Outline, FeatureSet);
77  }
79  NormalizePicoX(FeatureSet);
80  FreeOutlines(Outlines);
81  return (FeatureSet);
82 
83 } /* ExtractPicoFeatures */
#define iterate(l)
Definition: oldlist.h:101
void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: picofeat.cpp:155
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:94
#define MAX_PICO_FEATURES
Definition: picofeat.h:46
void NormalizePicoX(FEATURE_SET FeatureSet)
Definition: picofeat.cpp:193
#define first_node(l)
Definition: oldlist.h:92
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:167
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:37
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:276

◆ FreeNormProtos()

void tesseract::Classify::FreeNormProtos ( )

Definition at line 168 of file normmatch.cpp.

168  {
169  if (NormProtos != nullptr) {
170  for (int i = 0; i < NormProtos->NumProtos; i++)
174  Efree(NormProtos);
175  NormProtos = nullptr;
176  }
177 }
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:37
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
void Efree(void *ptr)
Definition: emalloc.cpp:45
NORM_PROTOS * NormProtos
Definition: classify.h:527
LIST * Protos
Definition: normmatch.cpp:38

◆ get_fontinfo_table() [1/2]

UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( )
inline

Definition at line 386 of file classify.h.

386  {
387  return fontinfo_table_;
388  }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529

◆ get_fontinfo_table() [2/2]

const UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( ) const
inline

Definition at line 389 of file classify.h.

389  {
390  return fontinfo_table_;
391  }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529

◆ get_fontset_table()

UnicityTable<FontSet>& tesseract::Classify::get_fontset_table ( )
inline

Definition at line 392 of file classify.h.

392  {
393  return fontset_table_;
394  }
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537

◆ GetAdaptiveFeatures()

int tesseract::Classify::GetAdaptiveFeatures ( TBLOB * Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET * FloatFeatures
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters
Blob: blob to extract features from
[out] IntFeatures: array to fill with integer features
[out] FloatFeatures: place to return actual floating-pt features
Returns
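Number of pico-features returned. Returns 0 if no features were extracted or if the count exceeds UNLIKELY_NUM_FEAT; in that case the extracted feature set is freed and FloatFeatures is not written.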

Definition at line 786 of file adaptmatch.cpp.

788  {
789  FEATURE_SET Features;
790  int NumFeatures;
791 
792  classify_norm_method.set_value(baseline);
793  Features = ExtractPicoFeatures(Blob);
794 
795  NumFeatures = Features->NumFeatures;
796  if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
797  FreeFeatureSet(Features);
798  return 0;
799  }
800 
801  ComputeIntFeatures(Features, IntFeatures);
802  *FloatFeatures = Features;
803 
804  return NumFeatures;
805 } /* GetAdaptiveFeatures */
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:78
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:63
uint16_t NumFeatures
Definition: ocrfeatures.h:66
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90

◆ GetAmbiguities()

UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB Blob,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters
Blob: blob to get classification ambiguities for
CorrectClass: correct class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns
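Newly allocated array of UNICHAR_IDs, terminated by -1, containing all possible ambiguous classes (nullptr if no training sample could be produced for the blob).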

Definition at line 1592 of file adaptmatch.cpp.

1593  {
1594  auto *Results = new ADAPT_RESULTS();
1595  UNICHAR_ID *Ambiguities;
1596  int i;
1597 
1598  Results->Initialize();
1599  INT_FX_RESULT_STRUCT fx_info;
1601  TrainingSample* sample =
1603  &bl_features);
1604  if (sample == nullptr) {
1605  delete Results;
1606  return nullptr;
1607  }
1608 
1609  CharNormClassifier(Blob, *sample, Results);
1610  delete sample;
1611  RemoveBadMatches(Results);
1612  Results->match.sort(&UnicharRating::SortDescendingRating);
1613 
1614  /* copy the class id's into an string of ambiguities - don't copy if
1615  the correct class is the only class id matched */
1616  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1617  if (Results->match.size() > 1 ||
1618  (Results->match.size() == 1 &&
1619  Results->match[0].unichar_id != CorrectClass)) {
1620  for (i = 0; i < Results->match.size(); i++)
1621  Ambiguities[i] = Results->match[i].unichar_id;
1622  Ambiguities[i] = -1;
1623  } else {
1624  Ambiguities[0] = -1;
1625  }
1626 
1627  delete Results;
1628  return Ambiguities;
1629 } /* GetAmbiguities */
int UNICHAR_ID
Definition: unichar.h:34
bool classify_nonlinear_norm
Definition: classify.h:452
void RemoveBadMatches(ADAPT_RESULTS *Results)
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55
Definition: cluster.h:32

◆ GetCharNormFeature()

int tesseract::Classify::GetCharNormFeature ( const INT_FX_RESULT_STRUCT fx_info,
INT_TEMPLATES  templates,
uint8_t *  pruner_norm_array,
uint8_t *  char_norm_array 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters
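templates: used to compute char norm adjustments
pruner_norm_array: array of factors from blob normalization process
char_norm_array: array to fill with dummy char norm adjustments
fx_info: (no description given; the listing reads its Ymean, Length, Rx and Ry fields)
Globals: (none listed)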
Returns
Number of features extracted or 0 if an error occurred.

Definition at line 1678 of file adaptmatch.cpp.

1681  {
1682  FEATURE norm_feature = NewFeature(&CharNormDesc);
1683  float baseline = kBlnBaselineOffset;
1684  float scale = MF_SCALE_FACTOR;
1685  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1686  norm_feature->Params[CharNormLength] =
1687  fx_info.Length * scale / LENGTH_COMPRESSION;
1688  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1689  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1690  // Deletes norm_feature.
1691  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1692  pruner_norm_array);
1693  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1694 } /* GetCharNormFeature */
const double kStandardFeatureLength
Definition: intfx.h:46
const int kBlnBaselineOffset
Definition: normalis.h:25
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:78
const float MF_SCALE_FACTOR
Definition: mfoutline.h:71
int16_t Ymean
Definition: intfx.h:37
const FEATURE_DESC_STRUCT CharNormDesc
float Params[1]
Definition: ocrfeatures.h:61
int32_t Length
Definition: intfx.h:36
#define LENGTH_COMPRESSION
Definition: normfeat.h:27
int IntCastRounded(double x)
Definition: helpers.h:175

◆ GetClassToDebug()

CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)

This routine prompts the user with Prompt and waits for the user to enter something in the debug window.

Parameters
Prompt: prompt to print while waiting for input from window
adaptive_on
pretrained_on
shape_id
Returns
Character entered in the debug window.
Note
Globals: none

Definition at line 1255 of file intproto.cpp.

1256  {
1257  tprintf("%s\n", Prompt);
1258  SVEvent* ev;
1259  SVEventType ev_type;
1260  int unichar_id = INVALID_UNICHAR_ID;
1261  // Wait until a click or popup event.
1262  do {
1263  ev = IntMatchWindow->AwaitEvent(SVET_ANY);
1264  ev_type = ev->type;
1265  if (ev_type == SVET_POPUP) {
1266  if (ev->command_id == IDA_SHAPE_INDEX) {
1267  if (shape_table_ != nullptr) {
1268  *shape_id = atoi(ev->parameter);
1269  *adaptive_on = false;
1270  *pretrained_on = true;
1271  if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
1272  int font_id;
1273  shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
1274  &font_id);
1275  tprintf("Shape %d, first unichar=%d, font=%d\n",
1276  *shape_id, unichar_id, font_id);
1277  return unichar_id;
1278  }
1279  tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
1280  } else {
1281  tprintf("No shape table loaded!\n");
1282  }
1283  } else {
1285  unichar_id = unicharset.unichar_to_id(ev->parameter);
1286  if (ev->command_id == IDA_ADAPTIVE) {
1287  *adaptive_on = true;
1288  *pretrained_on = false;
1289  *shape_id = -1;
1290  } else if (ev->command_id == IDA_STATIC) {
1291  *adaptive_on = false;
1292  *pretrained_on = true;
1293  } else {
1294  *adaptive_on = true;
1295  *pretrained_on = true;
1296  }
1297  if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {
1298  *shape_id = -1;
1299  return unichar_id;
1300  }
1301  for (int s = 0; s < shape_table_->NumShapes(); ++s) {
1302  if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
1303  tprintf("%s\n", shape_table_->DebugStr(s).string());
1304  }
1305  }
1306  } else {
1307  tprintf("Char class '%s' not found in unicharset",
1308  ev->parameter);
1309  }
1310  }
1311  }
1312  delete ev;
1313  } while (ev_type != SVET_CLICK);
1314  return 0;
1315 } /* GetClassToDebug */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
ShapeTable * shape_table_
Definition: classify.h:546
SVEventType
Definition: scrollview.h:45
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
int NumShapes() const
Definition: shapetable.h:274
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
const char * string() const
Definition: strngs.cpp:194
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
char * parameter
Definition: scrollview.h:66
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
SVEventType type
Definition: scrollview.h:64
UNICHARSET unicharset
Definition: ccutil.h:73
int command_id
Definition: scrollview.h:71

◆ getDict()

virtual Dict& tesseract::Classify::getDict ( )
inlinevirtual

Reimplemented in tesseract::Tesseract.

Definition at line 107 of file classify.h.

107  {
108  return dict_;
109  }

◆ GetFontinfoId()

int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS  Class,
uint8_t  ConfigId 
)

Definition at line 173 of file adaptive.cpp.

173  {
174  return (ConfigIsPermanent(Class, ConfigId) ?
175  PermConfigFor(Class, ConfigId)->FontinfoId :
176  TempConfigFor(Class, ConfigId)->FontinfoId);
177 }
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:82
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93

◆ InitAdaptedClass()

void tesseract::Classify::InitAdaptedClass ( TBLOB Blob,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS  Class,
ADAPT_TEMPLATES  Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters
Blob: blob to model new class after
ClassId: id of the class to be initialized
FontinfoId: font information inferred from pre-trained templates
Class: adapted class to be initialized
Templates: adapted templates to add new class to

Globals:

Definition at line 693 of file adaptmatch.cpp.

697  {
698  FEATURE_SET Features;
699  int Fid, Pid;
700  FEATURE Feature;
701  int NumFeatures;
702  TEMP_PROTO TempProto;
703  PROTO Proto;
704  INT_CLASS IClass;
706 
707  classify_norm_method.set_value(baseline);
708  Features = ExtractOutlineFeatures(Blob);
709  NumFeatures = Features->NumFeatures;
710  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
711  FreeFeatureSet(Features);
712  return;
713  }
714 
715  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
716  TempConfigFor(Class, 0) = Config;
717 
718  /* this is a kludge to construct cutoffs for adapted templates */
719  if (Templates == AdaptedTemplates)
720  BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
721 
722  IClass = ClassForClassId (Templates->Templates, ClassId);
723 
724  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
725  Pid = AddIntProto (IClass);
726  assert (Pid != NO_PROTO);
727 
728  Feature = Features->Features[Fid];
729  TempProto = NewTempProto ();
730  Proto = &(TempProto->Proto);
731 
732  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
733  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
734  instead of the -0.25 to 0.75 used in baseline normalization */
735  Proto->Angle = Feature->Params[OutlineFeatDir];
736  Proto->X = Feature->Params[OutlineFeatX];
737  Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
738  Proto->Length = Feature->Params[OutlineFeatLength];
739  FillABC(Proto);
740 
741  TempProto->ProtoId = Pid;
742  SET_BIT (Config->Protos, Pid);
743 
744  ConvertProto(Proto, Pid, IClass);
745  AddProtoToProtoPruner(Proto, Pid, IClass,
747 
748  Class->TempProtos = push (Class->TempProtos, TempProto);
749  }
750  FreeFeatureSet(Features);
751 
752  AddIntConfig(IClass);
753  ConvertConfig (AllProtosOn, 0, IClass);
754 
756  tprintf("Added new class '%s' with class id %d and %d protos.\n",
757  unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
759  DisplayAdaptedChar(Blob, IClass);
760  }
761 
762  if (IsEmptyAdaptedClass(Class))
763  (Templates->NumNonEmptyClasses)++;
764 } /* InitAdaptedClass */
#define ClassForClassId(T, c)
Definition: intproto.h:178
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:282
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:261
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:78
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:367
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:79
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:463
float X
Definition: protos.h:40
FEATURE Features[1]
Definition: ocrfeatures.h:68
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:228
PROTO_STRUCT Proto
Definition: adaptive.h:29
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:488
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:946
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:203
INT_TEMPLATES Templates
Definition: adaptive.h:67
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:84
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
void FillABC(PROTO Proto)
Definition: protos.cpp:108
#define SET_BIT(array, bit)
Definition: bitvec.h:55
uint16_t ProtoId
Definition: adaptive.h:28
BIT_VECTOR AllProtosOn
Definition: classify.h:522
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:41
float Length
Definition: protos.h:43
float Params[1]
Definition: ocrfeatures.h:61
float Y
Definition: protos.h:41
int classify_learning_debug_level
Definition: classify.h:455
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
uint16_t NumFeatures
Definition: ocrfeatures.h:66
UNICHARSET unicharset
Definition: ccutil.h:73
CLUSTERCONFIG Config
float Angle
Definition: protos.h:42
#define NO_PROTO
Definition: matchdefs.h:41

◆ InitAdaptiveClassifier()

void tesseract::Classify::InitAdaptiveClassifier ( TessdataManager mgr)

This routine reads in the training information needed by the adaptive classifier and saves it into global variables.

Parameters: load_pre_trained_templates - indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file.

Globals: BuiltInTemplatesFile - file to get built-in temps from; BuiltInCutoffsFile - file to get avg. feat per class from; classify_use_pre_adapted_templates - enables use of pre-adapted templates.

Definition at line 527 of file adaptmatch.cpp.

527  {
529  return;
530  if (AllProtosOn != nullptr)
531  EndAdaptiveClassifier(); // Don't leak with multiple inits.
532 
533  // If there is no language_data_path_prefix, the classifier will be
534  // adaptive only.
535  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
536  TFile fp;
537  ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
539 
540  if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
541  shape_table_ = new ShapeTable(unicharset);
542  if (!shape_table_->DeSerialize(&fp)) {
543  tprintf("Error loading shape table!\n");
544  delete shape_table_;
545  shape_table_ = nullptr;
546  }
547  }
548 
549  ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
550  ReadNewCutoffs(&fp, CharNormCutoffs);
551 
552  ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
553  NormProtos = ReadNormProtos(&fp);
554  static_classifier_ = new TessClassifier(false, this);
555  }
556 
557  InitIntegerFX();
558 
559  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
560  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
561  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
562  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
563  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
564  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
565  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
566 
567  for (uint16_t& BaselineCutoff : BaselineCutoffs) {
568  BaselineCutoff = 0;
569  }
570 
572  TFile fp;
573  STRING Filename;
574 
575  Filename = imagefile;
576  Filename += ADAPT_TEMPLATE_SUFFIX;
577  if (!fp.Open(Filename.string(), nullptr)) {
579  } else {
580  cprintf("\nReading pre-adapted templates from %s ...\n",
581  Filename.string());
582  fflush(stdout);
584  cprintf("\n");
586 
587  for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
588  BaselineCutoffs[i] = CharNormCutoffs[i];
589  }
590  }
591  } else {
592  if (AdaptedTemplates != nullptr)
595  }
596 } /* InitAdaptiveClassifier */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool classify_enable_adaptive_matcher
Definition: classify.h:445
ShapeTable * shape_table_
Definition: classify.h:546
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:244
BIT_VECTOR TempProtoMask
Definition: classify.h:525
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:190
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:718
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
const char * string() const
Definition: strngs.cpp:194
INT_TEMPLATES Templates
Definition: adaptive.h:67
void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs)
Definition: cutoffs.cpp:41
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
int32_t length() const
Definition: strngs.cpp:189
void InitIntegerFX()
Definition: intfx.cpp:49
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:151
BIT_VECTOR AllProtosOn
Definition: classify.h:522
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
STRING language_data_path_prefix
Definition: ccutil.h:72
Definition: strngs.h:45
#define MAX_NUM_PROTOS
Definition: intproto.h:48
STRING imagefile
Definition: ccutil.h:77
bool classify_use_pre_adapted_templates
Definition: classify.h:447
UNICHARSET unicharset
Definition: ccutil.h:73
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
#define ASSERT_HOST(x)
Definition: errcode.h:88
NORM_PROTOS * NormProtos
Definition: classify.h:527
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:332
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:75

◆ LargeSpeckle()

bool tesseract::Classify::LargeSpeckle ( const TBLOB blob)

Definition at line 242 of file classify.cpp.

242  {
243  double speckle_size = kBlnXHeight * speckle_large_max_size;
244  TBOX bbox = blob.bounding_box();
245  return bbox.width() < speckle_size && bbox.height() < speckle_size;
246 }
int16_t width() const
Definition: rect.h:115
double speckle_large_max_size
Definition: classify.h:509
const int kBlnXHeight
Definition: normalis.h:24
int16_t height() const
Definition: rect.h:108
Definition: rect.h:34
TBOX bounding_box() const
Definition: blobs.cpp:468

◆ LearnBlob()

void tesseract::Classify::LearnBlob ( const STRING fontname,
TBLOB Blob,
const DENORM cn_denorm,
const INT_FX_RESULT_STRUCT fx_info,
const char *  blob_text 
)

Definition at line 70 of file blobclass.cpp.

73  {
75  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
76  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
77  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
78  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
79 
80  if (ValidCharDescription(feature_defs_, CharDesc)) {
81  // Label the features with a class name and font name.
82  tr_file_data_ += "\n";
83  tr_file_data_ += fontname;
84  tr_file_data_ += " ";
85  tr_file_data_ += blob_text;
86  tr_file_data_ += "\n";
87 
88  // write micro-features to file and clean up
89  WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
90  } else {
91  tprintf("Blob learned was invalid!\n");
92  }
93  FreeCharDescription(CharDesc);
94 } // LearnBlob
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:61
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:43
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:148
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:195
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:217
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:174
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:41
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:247

◆ LearnPieces()

void tesseract::Classify::LearnPieces ( const char *  fontname,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES word 
)

Definition at line 374 of file adaptmatch.cpp.

376  {
377  // TODO(daria) Remove/modify this if/when we want
378  // to train and/or adapt to n-grams.
379  if (segmentation != CST_WHOLE &&
380  (segmentation != CST_FRAGMENT || disable_character_fragments))
381  return;
382 
383  if (length > 1) {
384  SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
385  start + length - 1);
386  }
387  TBLOB* blob = word->chopped_word->blobs[start];
388  // Rotate the blob if needed for classification.
389  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
390  if (rotated_blob == nullptr)
391  rotated_blob = blob;
392 
393  #ifndef GRAPHICS_DISABLED
394  // Draw debug windows showing the blob that is being learned if needed.
395  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
396  RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
397  word->chopped_word->bounding_box());
398  rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
399  learn_debug_win_->Update();
400  window_wait(learn_debug_win_);
401  }
402  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
403  ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
404  blob->plot(learn_fragments_debug_win_,
406  learn_fragments_debug_win_->Update();
407  }
408  #endif // GRAPHICS_DISABLED
409 
410  if (fontname != nullptr) {
411  classify_norm_method.set_value(character); // force char norm spc 30/11/93
412  tess_bn_matching.set_value(false); // turn it off
413  tess_cn_matching.set_value(false);
414  DENORM bl_denorm, cn_denorm;
415  INT_FX_RESULT_STRUCT fx_info;
417  &bl_denorm, &cn_denorm, &fx_info);
418  LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
419  } else if (unicharset.contains_unichar(correct_text)) {
420  UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
421  int font_id = word->fontinfo != nullptr
422  ? fontinfo_table_.get_id(*word->fontinfo)
423  : 0;
425  tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
426  unicharset.id_to_unichar(class_id), threshold, font_id);
427  // If filename is not nullptr we are doing recognition
428  // (as opposed to training), so we must have already set word fonts.
429  AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
430  if (BackupAdaptedTemplates != nullptr) {
431  // Adapt the backup templates too. They will be used if the primary gets
432  // too full.
433  AdaptToChar(rotated_blob, class_id, font_id, threshold,
435  }
436  } else if (classify_debug_level >= 1) {
437  tprintf("Can't adapt to %s not in unicharset\n", correct_text);
438  }
439  if (rotated_blob != blob) {
440  delete rotated_blob;
441  }
442 
443  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
444  start + length - 1);
445 } // LearnPieces.
int UNICHAR_ID
Definition: unichar.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:853
static void Update()
Definition: scrollview.cpp:709
const FontInfo * fontinfo
Definition: pageres.h:309
bool classify_debug_character_fragments
Definition: classify.h:491
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
bool classify_nonlinear_norm
Definition: classify.h:452
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
Definition: blobs.h:284
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:210
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:226
TWERD * chopped_word
Definition: pageres.h:212
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
char * classify_learn_debug_str
Definition: classify.h:495
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
int classify_learning_debug_level
Definition: classify.h:455
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
UNICHARSET unicharset
Definition: ccutil.h:73
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:70
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:346
bool disable_character_fragments
Definition: classify.h:486
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:188
#define ASSERT_HOST(x)
Definition: errcode.h:88
TBOX bounding_box() const
Definition: blobs.cpp:861
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510

◆ LearnWord()

void tesseract::Classify::LearnWord ( const char *  fontname,
WERD_RES word 
)

Definition at line 250 of file adaptmatch.cpp.

250  {
251  int word_len = word->correct_text.size();
252  if (word_len == 0) return;
253 
254  float* thresholds = nullptr;
255  if (fontname == nullptr) {
256  // Adaption mode.
257  if (!EnableLearning || word->best_choice == nullptr)
258  return; // Can't or won't adapt.
259 
261  tprintf("\n\nAdapting to word = %s\n",
262  word->best_choice->debug_string().string());
263  thresholds = new float[word_len];
267  matcher_rating_margin, thresholds);
268  }
269  int start_blob = 0;
270 
271  #ifndef GRAPHICS_DISABLED
273  if (learn_fragmented_word_debug_win_ != nullptr) {
274  window_wait(learn_fragmented_word_debug_win_);
275  }
276  RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
277  word->chopped_word->bounding_box());
278  RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
279  word->chopped_word->bounding_box());
280  word->chopped_word->plot(learn_fragmented_word_debug_win_);
282  }
283  #endif // GRAPHICS_DISABLED
284 
285  for (int ch = 0; ch < word_len; ++ch) {
287  tprintf("\nLearning %s\n", word->correct_text[ch].string());
288  }
289  if (word->correct_text[ch].length() > 0) {
290  float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
291 
292  LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
293  CST_WHOLE, word->correct_text[ch].string(), word);
294 
295  if (word->best_state[ch] > 1 && !disable_character_fragments) {
296  // Check that the character breaks into meaningful fragments
297  // that each match a whole character with at least
298  // classify_character_fragments_garbage_certainty_threshold
299  bool garbage = false;
300  int frag;
301  for (frag = 0; frag < word->best_state[ch]; ++frag) {
302  TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
304  garbage |= LooksLikeGarbage(frag_blob);
305  }
306  }
307  // Learn the fragments.
308  if (!garbage) {
309  bool pieces_all_natural = word->PiecesAllNatural(start_blob,
310  word->best_state[ch]);
311  if (pieces_all_natural || !prioritize_division) {
312  for (frag = 0; frag < word->best_state[ch]; ++frag) {
313  GenericVector<STRING> tokens;
314  word->correct_text[ch].split(' ', &tokens);
315 
316  tokens[0] = CHAR_FRAGMENT::to_string(
317  tokens[0].string(), frag, word->best_state[ch],
318  pieces_all_natural);
319 
320  STRING full_string;
321  for (int i = 0; i < tokens.size(); i++) {
322  full_string += tokens[i];
323  if (i != tokens.size() - 1)
324  full_string += ' ';
325  }
326  LearnPieces(fontname, start_blob + frag, 1, threshold,
327  CST_FRAGMENT, full_string.string(), word);
328  }
329  }
330  }
331  }
332 
333  // TODO(rays): re-enable this part of the code when we switch to the
334  // new classifier that needs to see examples of garbage.
335  /*
336  if (word->best_state[ch] > 1) {
337  // If the next blob is good, make junk with the rightmost fragment.
338  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
339  LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
340  word->best_state[ch + 1] + 1,
341  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
342  }
343  // If the previous blob is good, make junk with the leftmost fragment.
344  if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
345  LearnPieces(fontname, start_blob - word->best_state[ch - 1],
346  word->best_state[ch - 1] + 1,
347  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
348  }
349  }
350  // If the next blob is good, make a join with it.
351  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
352  STRING joined_text = word->correct_text[ch];
353  joined_text += word->correct_text[ch + 1];
354  LearnPieces(fontname, start_blob,
355  word->best_state[ch] + word->best_state[ch + 1],
356  threshold, CST_NGRAM, joined_text.string(), word);
357  }
358  */
359  }
360  start_blob += word->best_state[ch];
361  }
362  delete [] thresholds;
363 } // LearnWord.
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
static void Update()
Definition: scrollview.cpp:709
bool classify_debug_character_fragments
Definition: classify.h:491
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
Definition: blobs.h:284
double matcher_rating_margin
Definition: classify.h:460
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:374
const char * string() const
Definition: strngs.cpp:194
double matcher_good_threshold
Definition: classify.h:456
int length() const
Definition: genericvector.h:86
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:226
TWERD * chopped_word
Definition: pageres.h:212
void plot(ScrollView *window)
Definition: blobs.cpp:897
double certainty_scale
Definition: classify.h:473
STRING to_string() const
Definition: unicharset.h:79
GenericVector< int > best_state
Definition: pageres.h:285
int classify_learning_debug_level
Definition: classify.h:455
Definition: strngs.h:45
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:489
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
bool LooksLikeGarbage(TBLOB *blob)
int size() const
Definition: genericvector.h:72
double matcher_perfect_threshold
Definition: classify.h:458
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool prioritize_division
Definition: classify.h:428
bool disable_character_fragments
Definition: classify.h:486
TBOX bounding_box() const
Definition: blobs.cpp:861
const STRING debug_string() const
Definition: ratngs.h:495
GenericVector< STRING > correct_text
Definition: pageres.h:289

◆ LooksLikeGarbage()

bool tesseract::Classify::LooksLikeGarbage ( TBLOB *  blob)

Definition at line 1633 of file adaptmatch.cpp.

1633  {
1634  auto *ratings = new BLOB_CHOICE_LIST();
1635  AdaptiveClassifier(blob, ratings);
1636  BLOB_CHOICE_IT ratings_it(ratings);
1639  print_ratings_list("======================\nLooksLikeGarbage() got ",
1640  ratings, unicharset);
1641  }
1642  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1643  ratings_it.forward()) {
1644  if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1645  continue;
1646  }
1647  float certainty = ratings_it.data()->certainty();
1648  delete ratings;
1649  return certainty <
1651  }
1652  delete ratings;
1653  return true; // no whole characters in ratings
1654 }
bool classify_debug_character_fragments
Definition: classify.h:491
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
virtual Dict & getDict()
Definition: classify.h:107
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:489
UNICHARSET unicharset
Definition: ccutil.h:73

◆ MakeNewTemporaryConfig()

int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters
Templates: adapted templates to add new config to
ClassId: class id to associate with new config
FontinfoId: font information inferred from pre-trained templates
NumFeatures: number of features in IntFeatures
Features: features describing model for new config
FloatFeatures: floating-pt representation of features
Returns
The id of the new config created, a negative integer in case of error.

Definition at line 1740 of file adaptmatch.cpp.

1745  {
1746  INT_CLASS IClass;
1747  ADAPT_CLASS Class;
1748  PROTO_ID OldProtos[MAX_NUM_PROTOS];
1749  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1750  int NumOldProtos;
1751  int NumBadFeatures;
1752  int MaxProtoId, OldMaxProtoId;
1753  int MaskSize;
1754  int ConfigId;
1756  int i;
1757  int debug_level = NO_DEBUG;
1758 
1760  debug_level =
1762 
1763  IClass = ClassForClassId(Templates->Templates, ClassId);
1764  Class = Templates->Class[ClassId];
1765 
1766  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1767  ++NumAdaptationsFailed;
1769  cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1770  return -1;
1771  }
1772 
1773  OldMaxProtoId = IClass->NumProtos - 1;
1774 
1775  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1776  NumFeatures, Features,
1777  OldProtos, classify_adapt_proto_threshold,
1778  debug_level);
1779 
1780  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1781  zero_all_bits(TempProtoMask, MaskSize);
1782  for (i = 0; i < NumOldProtos; i++)
1783  SET_BIT(TempProtoMask, OldProtos[i]);
1784 
1785  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1786  NumFeatures, Features,
1787  BadFeatures,
1789  debug_level);
1790 
1791  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1792  IClass, Class, TempProtoMask);
1793  if (MaxProtoId == NO_PROTO) {
1794  ++NumAdaptationsFailed;
1796  cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1797  return -1;
1798  }
1799 
1800  ConfigId = AddIntConfig(IClass);
1801  ConvertConfig(TempProtoMask, ConfigId, IClass);
1802  Config = NewTempConfig(MaxProtoId, FontinfoId);
1803  TempConfigFor(Class, ConfigId) = Config;
1804  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1805 
1807  cprintf("Making new temp config %d fontinfo id %d"
1808  " using %d old and %d new protos.\n",
1809  ConfigId, Config->FontinfoId,
1810  NumOldProtos, MaxProtoId - OldMaxProtoId);
1811 
1812  return ConfigId;
1813 } /* MakeNewTemporaryConfig */
uint16_t NumProtos
Definition: intproto.h:106
#define ClassForClassId(T, c)
Definition: intproto.h:178
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:261
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
int classify_adapt_feature_threshold
Definition: classify.h:483
BIT_VECTOR TempProtoMask
Definition: classify.h:525
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:463
int classify_adapt_proto_threshold
Definition: classify.h:481
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
int16_t PROTO_ID
Definition: matchdefs.h:40
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:203
INT_TEMPLATES Templates
Definition: adaptive.h:67
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
uint8_t FEATURE_ID
Definition: matchdefs.h:46
#define PRINT_PROTO_MATCHES
Definition: intproto.h:192
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
IntegerMatcher im_
Definition: classify.h:540
#define SET_BIT(array, bit)
Definition: bitvec.h:55
BIT_VECTOR AllProtosOn
Definition: classify.h:522
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:589
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:188
int classify_learning_debug_level
Definition: classify.h:455
#define MAX_NUM_PROTOS
Definition: intproto.h:48
uint8_t NumConfigs
Definition: intproto.h:108
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:191
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
CLUSTERCONFIG Config
#define NO_DEBUG
Definition: adaptmatch.cpp:79
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:657
#define NO_PROTO
Definition: matchdefs.h:41

◆ MakeNewTempProtos()

PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS  IClass,
ADAPT_CLASS  Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters
Features: floating-pt features describing new character
NumBadFeat: number of bad features to turn into protos
BadFeat: feature id's of bad features
IClass: integer class templates to add new protos to
Class: adapted class templates to add new protos to
TempProtoMask: proto mask to add new protos to

Globals: none

Returns
Max proto id in class after all protos have been added.

Definition at line 1834 of file adaptmatch.cpp.

1839  {
1840  FEATURE_ID *ProtoStart;
1841  FEATURE_ID *ProtoEnd;
1842  FEATURE_ID *LastBad;
1843  TEMP_PROTO TempProto;
1844  PROTO Proto;
1845  FEATURE F1, F2;
1846  float X1, X2, Y1, Y2;
1847  float A1, A2, AngleDelta;
1848  float SegmentLength;
1849  PROTO_ID Pid;
1850 
1851  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1852  ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1853  F1 = Features->Features[*ProtoStart];
1854  X1 = F1->Params[PicoFeatX];
1855  Y1 = F1->Params[PicoFeatY];
1856  A1 = F1->Params[PicoFeatDir];
1857 
1858  for (ProtoEnd = ProtoStart + 1,
1859  SegmentLength = GetPicoFeatureLength();
1860  ProtoEnd < LastBad;
1861  ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1862  F2 = Features->Features[*ProtoEnd];
1863  X2 = F2->Params[PicoFeatX];
1864  Y2 = F2->Params[PicoFeatY];
1865  A2 = F2->Params[PicoFeatDir];
1866 
1867  AngleDelta = fabs(A1 - A2);
1868  if (AngleDelta > 0.5)
1869  AngleDelta = 1.0 - AngleDelta;
1870 
1871  if (AngleDelta > matcher_clustering_max_angle_delta ||
1872  fabs(X1 - X2) > SegmentLength ||
1873  fabs(Y1 - Y2) > SegmentLength)
1874  break;
1875  }
1876 
1877  F2 = Features->Features[*(ProtoEnd - 1)];
1878  X2 = F2->Params[PicoFeatX];
1879  Y2 = F2->Params[PicoFeatY];
1880  A2 = F2->Params[PicoFeatDir];
1881 
1882  Pid = AddIntProto(IClass);
1883  if (Pid == NO_PROTO)
1884  return (NO_PROTO);
1885 
1886  TempProto = NewTempProto();
1887  Proto = &(TempProto->Proto);
1888 
1889  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1890  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1891  instead of the -0.25 to 0.75 used in baseline normalization */
1892  Proto->Length = SegmentLength;
1893  Proto->Angle = A1;
1894  Proto->X = (X1 + X2) / 2.0;
1895  Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1896  FillABC(Proto);
1897 
1898  TempProto->ProtoId = Pid;
1899  SET_BIT(TempProtoMask, Pid);
1900 
1901  ConvertProto(Proto, Pid, IClass);
1902  AddProtoToProtoPruner(Proto, Pid, IClass,
1904 
1905  Class->TempProtos = push(Class->TempProtos, TempProto);
1906  }
1907  return IClass->NumProtos - 1;
1908 } /* MakeNewTempProtos */
uint16_t NumProtos
Definition: intproto.h:106
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:282
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:367
BIT_VECTOR TempProtoMask
Definition: classify.h:525
float X
Definition: protos.h:40
double matcher_clustering_max_angle_delta
Definition: classify.h:468
FEATURE Features[1]
Definition: ocrfeatures.h:68
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:228
PROTO_STRUCT Proto
Definition: adaptive.h:29
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:488
int16_t PROTO_ID
Definition: matchdefs.h:40
uint8_t FEATURE_ID
Definition: matchdefs.h:46
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:84
void FillABC(PROTO Proto)
Definition: protos.cpp:108
#define GetPicoFeatureLength()
Definition: picofeat.h:57
#define SET_BIT(array, bit)
Definition: bitvec.h:55
uint16_t ProtoId
Definition: adaptive.h:28
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
float Length
Definition: protos.h:43
float Params[1]
Definition: ocrfeatures.h:61
float Y
Definition: protos.h:41
int classify_learning_debug_level
Definition: classify.h:455
float Angle
Definition: protos.h:42
#define NO_PROTO
Definition: matchdefs.h:41

◆ MakePermanent()

void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  ConfigId,
TBLOB *  Blob 
)
Parameters
Templates: current set of adaptive templates
ClassId: class containing config to be made permanent
ConfigId: config to be made permanent
Blob: current blob being adapted to

Globals: none

Definition at line 1920 of file adaptmatch.cpp.

1923  {
1924  UNICHAR_ID *Ambigs;
1926  ADAPT_CLASS Class;
1927  PROTO_KEY ProtoKey;
1928 
1929  Class = Templates->Class[ClassId];
1930  Config = TempConfigFor(Class, ConfigId);
1931 
1932  MakeConfigPermanent(Class, ConfigId);
1933  if (Class->NumPermConfigs == 0)
1934  Templates->NumPermClasses++;
1935  Class->NumPermConfigs++;
1936 
1937  // Initialize permanent config.
1938  Ambigs = GetAmbiguities(Blob, ClassId);
1939  auto Perm = static_cast<PERM_CONFIG>(malloc(sizeof(PERM_CONFIG_STRUCT)));
1940  Perm->Ambigs = Ambigs;
1941  Perm->FontinfoId = Config->FontinfoId;
1942 
1943  // Free memory associated with temporary config (since ADAPTED_CONFIG
1944  // is a union we need to clean up before we record permanent config).
1945  ProtoKey.Templates = Templates;
1946  ProtoKey.ClassId = ClassId;
1947  ProtoKey.ConfigId = ConfigId;
1948  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1950 
1951  // Record permanent config.
1952  PermConfigFor(Class, ConfigId) = Perm;
1953 
1954  if (classify_learning_debug_level >= 1) {
1955  tprintf("Making config %d for %s (ClassId %d) permanent:"
1956  " fontinfo id %d, ambiguities '",
1957  ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
1958  ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
1959  for (UNICHAR_ID *AmbigsPointer = Ambigs;
1960  *AmbigsPointer >= 0; ++AmbigsPointer)
1961  tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1962  tprintf("'.\n");
1963  }
1964 } /* MakePermanent */
int UNICHAR_ID
Definition: unichar.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
int MakeTempProtoPerm(void *item1, void *item2)
CLASS_ID ClassId
Definition: adaptmatch.cpp:124
uint8_t NumPermClasses
Definition: adaptive.h:69
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:110
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:74
UNICHAR_ID * Ambigs
Definition: adaptive.h:45
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:85
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:123
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
virtual Dict & getDict()
Definition: classify.h:107
int classify_learning_debug_level
Definition: classify.h:455
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
uint8_t NumPermConfigs
Definition: adaptive.h:56
UNICHARSET unicharset
Definition: ccutil.h:73
CLUSTERCONFIG Config

◆ MasterMatcher()

void tesseract::Classify::MasterMatcher ( INT_TEMPLATES  templates,
int16_t  num_features,
const INT_FEATURE_STRUCT *  features,
const uint8_t *  norm_factors,
ADAPT_CLASS *  classes,
int  debug,
int  matcher_multiplier,
const TBOX blob_box,
const GenericVector< CP_RESULT_STRUCT > &  results,
ADAPT_RESULTS *  final_results 
)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

Definition at line 1088 of file adaptmatch.cpp.

1097  {
1098  int top = blob_box.top();
1099  int bottom = blob_box.bottom();
1100  UnicharRating int_result;
1101  for (int c = 0; c < results.size(); c++) {
1102  CLASS_ID class_id = results[c].Class;
1103  BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
1104  : AllProtosOn;
1105  BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
1106  : AllConfigsOn;
1107 
1108  int_result.unichar_id = class_id;
1109  im_.Match(ClassForClassId(templates, class_id),
1110  protos, configs,
1111  num_features, features,
1112  &int_result, classify_adapt_feature_threshold, debug,
1114  bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1115  ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
1116  results[c].Rating,
1117  final_results->BlobLength,
1118  matcher_multiplier, norm_factors,
1119  &int_result, final_results);
1120  }
1121 }
BIT_VECTOR PermConfigs
Definition: adaptive.h:60
#define ClassForClassId(T, c)
Definition: intproto.h:178
int classify_adapt_feature_threshold
Definition: classify.h:483
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
int32_t BlobLength
Definition: adaptmatch.cpp:92
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
BIT_VECTOR PermProtos
Definition: adaptive.h:59
IntegerMatcher im_
Definition: classify.h:540
BIT_VECTOR AllProtosOn
Definition: classify.h:522
int16_t bottom() const
Definition: rect.h:65
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
bool matcher_debug_separate_windows
Definition: classify.h:494
int size() const
Definition: genericvector.h:72
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
int16_t top() const
Definition: rect.h:58

◆ NewAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool  InitFromUnicharset)

Allocates memory for adapted templates.

Parameters
InitFromUnicharset: if true, add an empty class for each char in unicharset to the newly created templates
Returns
Ptr to new adapted templates.
Note
Globals: none

Definition at line 151 of file adaptive.cpp.

151  {
152  ADAPT_TEMPLATES Templates;
153 
154  Templates = static_cast<ADAPT_TEMPLATES>(Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT)));
155 
156  Templates->Templates = NewIntTemplates ();
157  Templates->NumPermClasses = 0;
158  Templates->NumNonEmptyClasses = 0;
159 
160  /* Insert an empty class for each unichar id in unicharset */
161  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
162  Templates->Class[i] = nullptr;
163  if (InitFromUnicharset && i < unicharset.size()) {
164  AddAdaptedClass(Templates, NewAdaptedClass(), i);
165  }
166  }
167 
168  return (Templates);
169 
170 } /* NewAdaptedTemplates */
int size() const
Definition: unicharset.h:341
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:682
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
uint8_t NumPermClasses
Definition: adaptive.h:69
INT_TEMPLATES Templates
Definition: adaptive.h:67
ADAPT_CLASS NewAdaptedClass()
Definition: adaptive.cpp:102
void AddAdaptedClass(ADAPT_TEMPLATES Templates, ADAPT_CLASS Class, CLASS_ID ClassId)
Definition: adaptive.cpp:45
void * Emalloc(int Size)
Definition: emalloc.cpp:31
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
UNICHARSET unicharset
Definition: ccutil.h:73

◆ NormalizeOutlines()

void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
float *  XScale,
float *  YScale 
)

This routine normalizes every outline in Outlines according to the currently selected normalization method. It also returns the scale factors that it used to do this scaling. The scale factors returned represent the x and y sizes in the normalized coordinate system that correspond to 1 pixel in the original coordinate system. Outlines are changed and XScale and YScale are updated.

Globals:

  • classify_norm_method method being used for normalization
  • classify_char_norm_range map radius of gyration to this value
    Parameters
    Outlines: list of outlines to be normalized
    XScale: x-direction scale factor used by routine
    YScale: y-direction scale factor used by routine

Definition at line 276 of file mfoutline.cpp.

278  {
279  MFOUTLINE Outline;
280 
281  switch (classify_norm_method) {
282  case character:
283  ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
284  break;
285 
286  case baseline:
287  iterate(Outlines) {
288  Outline = static_cast<MFOUTLINE>first_node(Outlines);
289  NormalizeOutline(Outline, 0.0);
290  }
291  *XScale = *YScale = MF_SCALE_FACTOR;
292  break;
293  }
294 } /* NormalizeOutlines */
#define iterate(l)
Definition: oldlist.h:101
const float MF_SCALE_FACTOR
Definition: mfoutline.h:71
void NormalizeOutline(MFOUTLINE Outline, float XOrigin)
Definition: mfoutline.cpp:242
#define first_node(l)
Definition: oldlist.h:92
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ PrintAdaptedTemplates()

void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters
File: open text file to print Templates to
Templates: adapted templates to print to File
Note
Globals: none

Definition at line 244 of file adaptive.cpp.

244  {
245  INT_CLASS IClass;
246  ADAPT_CLASS AClass;
247 
248  fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
249  fprintf (File, "Num classes = %d; Num permanent classes = %d\n\n",
250  Templates->NumNonEmptyClasses, Templates->NumPermClasses);
251  fprintf (File, " Id NC NPC NP NPP\n");
252  fprintf (File, "------------------------\n");
253 
254  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
255  IClass = Templates->Templates->Class[i];
256  AClass = Templates->Class[i];
257  if (!IsEmptyAdaptedClass (AClass)) {
258  fprintf (File, "%5d %s %3d %3d %3d %3d\n",
260  IClass->NumConfigs, AClass->NumPermConfigs,
261  IClass->NumProtos,
262  IClass->NumProtos - count (AClass->TempProtos));
263  }
264  }
265  fprintf (File, "\n");
266 
267 } /* PrintAdaptedTemplates */
uint16_t NumProtos
Definition: intproto.h:106
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:79
uint8_t NumPermClasses
Definition: adaptive.h:69
INT_TEMPLATES Templates
Definition: adaptive.h:67
int count(LIST var_list)
Definition: oldlist.cpp:95
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
uint8_t NumConfigs
Definition: intproto.h:108
uint8_t NumPermConfigs
Definition: adaptive.h:56
UNICHARSET unicharset
Definition: ccutil.h:73

◆ PrintAdaptiveMatchResults()

void tesseract::Classify::PrintAdaptiveMatchResults ( const ADAPT_RESULTS results)

This routine prints the matches in results (the listing below emits them via tprintf; there is no File parameter).

Parameters
results: match results to print

Globals: none

Definition at line 2013 of file adaptmatch.cpp.

2013  {
2014  for (int i = 0; i < results.match.size(); ++i) {
2015  tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2016  results.match[i].Print();
2017  }
2018 } /* PrintAdaptiveMatchResults */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
const char * string() const
Definition: strngs.cpp:194
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:73
int size() const
Definition: genericvector.h:72

◆ PruneClasses()

int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT *  int_templates,
int  num_features,
int  keep_this,
const INT_FEATURE_STRUCT *  features,
const uint8_t *  normalization_factors,
const uint16_t *  expected_num_features,
GenericVector< CP_RESULT_STRUCT > *  results 
)

Runs the class pruner from int_templates on the given features, returning the number of classes output in results.

Parameters
int_templates: Class pruner tables
num_features: Number of features in blob
features: Array of features
normalization_factors: Array of fudge factors from blob normalization process (by CLASS_INDEX)
expected_num_features: Array of expected number of features for each class (by CLASS_INDEX)
results: Sorted Array of pruned classes. Must be an array of size at least int_templates->NumClasses.
keep_this: (no description in the original documentation) forwarded to the pruner's PruneAndSort call

Definition at line 452 of file intmatcher.cpp.

457  {
458  ClassPruner pruner(int_templates->NumClasses);
459  // Compute initial match scores for all classes.
460  pruner.ComputeScores(int_templates, num_features, features);
461  // Adjust match scores for number of expected features.
462  pruner.AdjustForExpectedNumFeatures(expected_num_features,
464  // Apply disabled classes in unicharset - only works without a shape_table.
465  if (shape_table_ == nullptr)
466  pruner.DisableDisabledClasses(unicharset);
467  // If fragments are disabled, remove them, also only without a shape table.
468  if (disable_character_fragments && shape_table_ == nullptr)
469  pruner.DisableFragments(unicharset);
470 
471  // If we have good x-heights, apply the given normalization factors.
472  if (normalization_factors != nullptr) {
473  pruner.NormalizeForXheight(classify_class_pruner_multiplier,
474  normalization_factors);
475  } else {
476  pruner.NoNormalization();
477  }
478  // Do the actual pruning and sort the short-list.
479  pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
480  shape_table_ == nullptr, unicharset);
481 
482  if (classify_debug_level > 2) {
483  pruner.DebugMatch(*this, int_templates, features);
484  }
485  if (classify_debug_level > 1) {
486  pruner.SummarizeResult(*this, int_templates, expected_num_features,
488  normalization_factors);
489  }
490  // Convert to the expected output format.
491  return pruner.SetupResults(results);
492 }
ShapeTable * shape_table_
Definition: classify.h:546
int classify_class_pruner_threshold
Definition: classify.h:499
int classify_cp_cutoff_strength
Definition: classify.h:503
int classify_class_pruner_multiplier
Definition: classify.h:501
UNICHARSET unicharset
Definition: ccutil.h:73
bool disable_character_fragments
Definition: classify.h:486

◆ ReadAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( TFile *  fp)

Read a set of adapted templates from file and return a ptr to the templates.

Parameters
fp: open text file to read adapted templates from
Returns
Ptr to adapted templates read from file.
Note
Globals: none

Definition at line 332 of file adaptive.cpp.

332  {
333  ADAPT_TEMPLATES Templates;
334 
335  /* first read the high level adaptive template struct */
336  Templates = static_cast<ADAPT_TEMPLATES>(Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT)));
337  fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
338 
339  /* then read in the basic integer templates */
340  Templates->Templates = ReadIntTemplates(fp);
341 
342  /* then read in the adaptive info for each class */
343  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
344  Templates->Class[i] = ReadAdaptedClass(fp);
345  }
346  return (Templates);
347 
348 } /* ReadAdaptedTemplates */
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:718
INT_TEMPLATES Templates
Definition: adaptive.h:67
ADAPT_CLASS ReadAdaptedClass(TFile *fp)
Definition: adaptive.cpp:281
void * Emalloc(int Size)
Definition: emalloc.cpp:31
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
int FRead(void *buffer, size_t size, int count)
Definition: serialis.cpp:271

◆ ReadIntTemplates()

INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( TFile *  fp)

This routine reads a set of integer templates from File. File must already be open and must be in the correct binary format.

Parameters
fp: open file to read templates from
Returns
Pointer to integer templates read from File.
Note
Globals: none

Definition at line 718 of file intproto.cpp.

718  {
719  int i, j, w, x, y, z;
720  int unicharset_size;
721  int version_id = 0;
722  INT_TEMPLATES Templates;
723  CLASS_PRUNER_STRUCT* Pruner;
724  INT_CLASS Class;
725  uint8_t *Lengths;
726  PROTO_SET ProtoSet;
727 
728  /* variables for conversion from older inttemp formats */
729  int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
730  CLASS_ID class_id, max_class_id;
731  auto *IndexFor = new int16_t[MAX_NUM_CLASSES];
732  auto *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
733  auto **TempClassPruner =
735  uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
736  (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
737  uint32_t Mask, NewMask, ClassBits;
738  int MaxNumConfigs = MAX_NUM_CONFIGS;
739  int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
740 
741  /* first read the high level template struct */
742  Templates = NewIntTemplates();
743  // Read Templates in parts for 64 bit compatibility.
744  if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1)
745  tprintf("Bad read of inttemp!\n");
746  if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
747  1) != 1 ||
748  fp->FReadEndian(&Templates->NumClassPruners,
749  sizeof(Templates->NumClassPruners), 1) != 1)
750  tprintf("Bad read of inttemp!\n");
751  if (Templates->NumClasses < 0) {
752  // This file has a version id!
753  version_id = -Templates->NumClasses;
754  if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
755  1) != 1)
756  tprintf("Bad read of inttemp!\n");
757  }
758 
759  if (version_id < 3) {
760  MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
761  WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
762  }
763 
764  if (version_id < 2) {
765  if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size) !=
766  unicharset_size) {
767  tprintf("Bad read of inttemp!\n");
768  }
769  if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
770  Templates->NumClasses) != Templates->NumClasses) {
771  tprintf("Bad read of inttemp!\n");
772  }
773  }
774 
775  /* then read in the class pruners */
776  const int kNumBuckets =
778  for (i = 0; i < Templates->NumClassPruners; i++) {
779  Pruner = new CLASS_PRUNER_STRUCT;
780  if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) !=
781  kNumBuckets) {
782  tprintf("Bad read of inttemp!\n");
783  }
784  if (version_id < 2) {
785  TempClassPruner[i] = Pruner;
786  } else {
787  Templates->ClassPruners[i] = Pruner;
788  }
789  }
790 
791  /* fix class pruners if they came from an old version of inttemp */
792  if (version_id < 2) {
793  // Allocate enough class pruners to cover all the class ids.
794  max_class_id = 0;
795  for (i = 0; i < Templates->NumClasses; i++)
796  if (ClassIdFor[i] > max_class_id)
797  max_class_id = ClassIdFor[i];
798  for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
799  Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
800  memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
801  }
802  // Convert class pruners from the old format (indexed by class index)
803  // to the new format (indexed by class id).
804  last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
805  for (i = 0; i < Templates->NumClassPruners; i++) {
806  for (x = 0; x < NUM_CP_BUCKETS; x++)
807  for (y = 0; y < NUM_CP_BUCKETS; y++)
808  for (z = 0; z < NUM_CP_BUCKETS; z++)
809  for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
810  if (TempClassPruner[i]->p[x][y][z][w] == 0)
811  continue;
812  for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
813  bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
814  if (bit_number > last_cp_bit_number)
815  break; // the rest of the bits in this word are not used
816  class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
817  // Single out NUM_BITS_PER_CLASS bits relating to class_id.
818  Mask = SetBitsForMask << b;
819  ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
820  // Move these bits to the new position in which they should
821  // appear (indexed corresponding to the class_id).
822  new_i = CPrunerIdFor(class_id);
823  new_w = CPrunerWordIndexFor(class_id);
824  new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
825  if (new_b > b) {
826  ClassBits <<= (new_b - b);
827  } else {
828  ClassBits >>= (b - new_b);
829  }
830  // Copy bits relating to class_id to the correct position
831  // in Templates->ClassPruner.
832  NewMask = SetBitsForMask << new_b;
833  Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
834  Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
835  }
836  }
837  }
838  for (i = 0; i < Templates->NumClassPruners; i++) {
839  delete TempClassPruner[i];
840  }
841  }
842 
843  /* then read in each class */
844  for (i = 0; i < Templates->NumClasses; i++) {
845  /* first read in the high level struct for the class */
846  Class = static_cast<INT_CLASS>(Emalloc (sizeof (INT_CLASS_STRUCT)));
847  if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
848  fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
849  fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
850  tprintf("Bad read of inttemp!\n");
851  if (version_id == 0) {
852  // Only version 0 writes 5 pointless pointers to the file.
853  for (j = 0; j < 5; ++j) {
854  int32_t junk;
855  if (fp->FRead(&junk, sizeof(junk), 1) != 1)
856  tprintf("Bad read of inttemp!\n");
857  }
858  }
859  int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
860  ASSERT_HOST(num_configs <= MaxNumConfigs);
861  if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) !=
862  num_configs) {
863  tprintf("Bad read of inttemp!\n");
864  }
865  if (version_id < 2) {
866  ClassForClassId (Templates, ClassIdFor[i]) = Class;
867  } else {
868  ClassForClassId (Templates, i) = Class;
869  }
870 
871  /* then read in the proto lengths */
872  Lengths = nullptr;
873  if (MaxNumIntProtosIn (Class) > 0) {
874  Lengths = static_cast<uint8_t *>(Emalloc(sizeof(uint8_t) * MaxNumIntProtosIn(Class)));
875  if (fp->FRead(Lengths, sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=
876  MaxNumIntProtosIn(Class))
877  tprintf("Bad read of inttemp!\n");
878  }
879  Class->ProtoLengths = Lengths;
880 
881  /* then read in the proto sets */
882  for (j = 0; j < Class->NumProtoSets; j++) {
883  ProtoSet = static_cast<PROTO_SET>(Emalloc(sizeof(PROTO_SET_STRUCT)));
884  int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
885  if (fp->FReadEndian(&ProtoSet->ProtoPruner,
886  sizeof(ProtoSet->ProtoPruner[0][0][0]),
887  num_buckets) != num_buckets)
888  tprintf("Bad read of inttemp!\n");
889  for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
890  if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
891  1) != 1 ||
892  fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
893  1) != 1 ||
894  fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
895  1) != 1 ||
896  fp->FRead(&ProtoSet->Protos[x].Angle,
897  sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
898  tprintf("Bad read of inttemp!\n");
899  if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
900  sizeof(ProtoSet->Protos[x].Configs[0]),
901  WerdsPerConfigVec) != WerdsPerConfigVec)
902  cprintf("Bad read of inttemp!\n");
903  }
904  Class->ProtoSets[j] = ProtoSet;
905  }
906  if (version_id < 4) {
907  Class->font_set_id = -1;
908  } else {
909  fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);
910  }
911  }
912 
913  if (version_id < 2) {
914  /* add an empty nullptr class with class id 0 */
915  assert(UnusedClassIdIn (Templates, 0));
916  ClassForClassId (Templates, 0) = NewIntClass (1, 1);
917  ClassForClassId (Templates, 0)->font_set_id = -1;
918  Templates->NumClasses++;
919  /* make sure the classes are contiguous */
920  for (i = 0; i < MAX_NUM_CLASSES; i++) {
921  if (i < Templates->NumClasses) {
922  if (ClassForClassId (Templates, i) == nullptr) {
923  fprintf(stderr, "Non-contiguous class ids in inttemp\n");
924  exit(1);
925  }
926  } else {
927  if (ClassForClassId (Templates, i) != nullptr) {
928  fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
929  i, Templates->NumClasses);
930  exit(1);
931  }
932  }
933  }
934  }
935  if (version_id >= 4) {
937  if (version_id >= 5) {
938  this->fontinfo_table_.read(fp,
940  }
942  }
943 
944  // Clean up.
945  delete[] IndexFor;
946  delete[] ClassIdFor;
947  delete[] TempClassPruner;
948 
949  return (Templates);
950 } /* ReadIntTemplates */
uint16_t NumProtos
Definition: intproto.h:106
#define CPrunerIdFor(c)
Definition: intproto.h:180
#define ClassForClassId(T, c)
Definition: intproto.h:178
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:682
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
#define PROTOS_PER_PROTO_SET
Definition: intproto.h:49
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:170
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
#define NUM_PP_BUCKETS
Definition: intproto.h:52
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
INT_PROTO_STRUCT Protos[PROTOS_PER_PROTO_SET]
Definition: intproto.h:97
uint8_t * ProtoLengths
Definition: intproto.h:110
#define WERDS_PER_PP_VECTOR
Definition: intproto.h:63
uint8_t Angle
Definition: intproto.h:85
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
#define MaxNumIntProtosIn(C)
Definition: intproto.h:165
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
uint32_t Configs[WERDS_PER_CONFIG_VEC]
Definition: intproto.h:86
INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs)
Definition: intproto.cpp:626
#define OLD_WERDS_PER_CONFIG_VEC
Definition: intproto.cpp:109
#define WERDS_PER_CONFIG_VEC
Definition: intproto.h:68
bool read_set(TFile *f, FontSet *fs)
Definition: fontinfo.cpp:226
#define BITS_PER_CP_VECTOR
Definition: intproto.h:59
PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS]
Definition: intproto.h:109
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
PROTO_PRUNER ProtoPruner
Definition: intproto.h:96
#define CPrunerBitIndexFor(c)
Definition: intproto.h:183
#define MAX_NUM_CLASS_PRUNERS
Definition: intproto.h:60
#define UnusedClassIdIn(T, c)
Definition: intproto.h:177
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
void * Emalloc(int Size)
Definition: emalloc.cpp:31
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:153
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
uint8_t NumConfigs
Definition: intproto.h:108
uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:78
#define BITS_PER_WERD
Definition: intproto.h:45
#define NUM_PP_PARAMS
Definition: intproto.h:51
#define CPrunerWordIndexFor(c)
Definition: intproto.h:182
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define NUM_CP_BUCKETS
Definition: intproto.h:53
uint8_t NumProtoSets
Definition: intproto.h:107
#define OLD_MAX_NUM_CONFIGS
Definition: intproto.cpp:108

◆ ReadNewCutoffs()

void tesseract::Classify::ReadNewCutoffs ( TFile * fp,
uint16_t *  Cutoffs 
)

Open file, read in all of the class-id/cutoff pairs and insert them into the Cutoffs array. Cutoffs are indexed in the array by class id. Unused entries in the array are set to an arbitrarily high cutoff value.

Parameters
fp: file containing cutoff definitions
Cutoffs: array to put cutoffs into

Definition at line 41 of file cutoffs.cpp.

41  {
42  int Cutoff;
43 
44  if (shape_table_ != nullptr) {
45  if (!shapetable_cutoffs_.DeSerialize(fp)) {
46  tprintf("Error during read of shapetable pffmtable!\n");
47  }
48  }
49  for (int i = 0; i < MAX_NUM_CLASSES; i++)
50  Cutoffs[i] = MAX_CUTOFF;
51 
52  const int kMaxLineSize = 100;
53  char line[kMaxLineSize];
54  while (fp->FGets(line, kMaxLineSize) != nullptr) {
55  std::string Class;
56  CLASS_ID ClassId;
57  std::istringstream stream(line);
58  stream >> Class >> Cutoff;
59  if (stream.fail()) {
60  break;
61  }
62  if (Class.compare("NULL") == 0) {
63  ClassId = unicharset.unichar_to_id(" ");
64  } else {
65  ClassId = unicharset.unichar_to_id(Class.c_str());
66  }
67  ASSERT_HOST(ClassId >= 0 && ClassId < MAX_NUM_CLASSES);
68  Cutoffs[ClassId] = Cutoff;
69  }
70 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
ShapeTable * shape_table_
Definition: classify.h:546
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
#define MAX_CUTOFF
Definition: cutoffs.cpp:30
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
bool DeSerialize(bool swap, FILE *fp)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
UNICHARSET unicharset
Definition: ccutil.h:73
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ ReadNormProtos()

NORM_PROTOS * tesseract::Classify::ReadNormProtos ( TFile * fp)

This routine allocates a new data structure to hold a set of character normalization protos. It then fills in the data structure by reading from the specified File.

Parameters
fp: open text file to read normalization protos from. Globals: none
Returns
Character normalization protos.

Definition at line 190 of file normmatch.cpp.

190  {
192  int i;
193  char unichar[2 * UNICHAR_LEN + 1];
194  UNICHAR_ID unichar_id;
195  LIST Protos;
196  int NumProtos;
197 
198  /* allocate and initialization data structure */
199  NormProtos = static_cast<NORM_PROTOS *>(Emalloc (sizeof (NORM_PROTOS)));
201  NormProtos->Protos = static_cast<LIST *>(Emalloc (NormProtos->NumProtos * sizeof(LIST)));
202  for (i = 0; i < NormProtos->NumProtos; i++)
203  NormProtos->Protos[i] = NIL_LIST;
204 
205  /* read file header and save in data structure */
208 
209  /* read protos for each class into a separate list */
210  const int kMaxLineSize = 100;
211  char line[kMaxLineSize];
212  while (fp->FGets(line, kMaxLineSize) != nullptr) {
213  std::istringstream stream(line);
214  stream >> unichar >> NumProtos;
215  if (stream.fail()) {
216  continue;
217  }
218  if (unicharset.contains_unichar(unichar)) {
219  unichar_id = unicharset.unichar_to_id(unichar);
220  Protos = NormProtos->Protos[unichar_id];
221  for (i = 0; i < NumProtos; i++)
222  Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
223  NormProtos->Protos[unichar_id] = Protos;
224  } else {
225  tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
226  unichar);
227  for (i = 0; i < NumProtos; i++)
229  }
230  }
231  return (NormProtos);
232 } /* ReadNormProtos */
int UNICHAR_ID
Definition: unichar.h:34
void FreePrototype(void *arg)
Definition: cluster.cpp:549
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:37
int size() const
Definition: unicharset.h:341
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:140
#define UNICHAR_LEN
Definition: unichar.h:30
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
#define NIL_LIST
Definition: oldlist.h:76
void * Emalloc(int Size)
Definition: emalloc.cpp:31
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:120
UNICHARSET unicharset
Definition: ccutil.h:73
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:227
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:176
NORM_PROTOS * NormProtos
Definition: classify.h:527
LIST * Protos
Definition: normmatch.cpp:38

◆ RefreshDebugWindow()

void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX & wbox
)

Definition at line 226 of file adaptmatch.cpp.

227  {
228  #ifndef GRAPHICS_DISABLED
229  const int kSampleSpaceWidth = 500;
230  if (*win == nullptr) {
231  *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
232  kSampleSpaceWidth * 2, 200, true);
233  }
234  (*win)->Clear();
235  (*win)->Pen(64, 64, 64);
236  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
237  kSampleSpaceWidth, kBlnBaselineOffset);
238  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
239  kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
240  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
241  wbox.right(), wbox.bottom());
242  #endif // GRAPHICS_DISABLED
243 }
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
int16_t top() const
Definition: rect.h:58

◆ RemoveBadMatches()

void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS * Results)

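This routine steps through each matching class in Results and removes it from the match list if its rating is worse than the best rating minus a pad (matcher_bad_match_pad), i.e. only matches with a rating of at least best_rating - matcher_bad_match_pad are kept. In other words, all good matches get moved to the front of the classes array.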

Parameters
Results: contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"

Definition at line 2033 of file adaptmatch.cpp.

2033  {
2034  int Next, NextGood;
2035  float BadMatchThreshold;
2036  static const char* romans = "i v x I V X";
2037  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2038 
2040  UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2041  unicharset.unichar_to_id("1") : -1;
2042  UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2043  unicharset.unichar_to_id("0") : -1;
2044  float scored_one = ScoredUnichar(unichar_id_one, *Results);
2045  float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2046 
2047  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2048  const UnicharRating& match = Results->match[Next];
2049  if (match.rating >= BadMatchThreshold) {
2050  if (!unicharset.get_isalpha(match.unichar_id) ||
2051  strstr(romans,
2052  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2053  } else if (unicharset.eq(match.unichar_id, "l") &&
2054  scored_one < BadMatchThreshold) {
2055  Results->match[Next].unichar_id = unichar_id_one;
2056  } else if (unicharset.eq(match.unichar_id, "O") &&
2057  scored_zero < BadMatchThreshold) {
2058  Results->match[Next].unichar_id = unichar_id_zero;
2059  } else {
2060  Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2061  }
2062  if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2063  if (NextGood == Next) {
2064  ++NextGood;
2065  } else {
2066  Results->match[NextGood++] = Results->match[Next];
2067  }
2068  }
2069  }
2070  }
2071  } else {
2072  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2073  if (Results->match[Next].rating >= BadMatchThreshold) {
2074  if (NextGood == Next) {
2075  ++NextGood;
2076  } else {
2077  Results->match[NextGood++] = Results->match[Next];
2078  }
2079  }
2080  }
2081  }
2082  Results->match.truncate(NextGood);
2083 } /* RemoveBadMatches */
int UNICHAR_ID
Definition: unichar.h:34
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
void truncate(int size)
float best_rating
Definition: adaptmatch.cpp:96
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
bool classify_bln_numeric_mode
Definition: classify.h:508
UNICHARSET unicharset
Definition: ccutil.h:73
int size() const
Definition: genericvector.h:72
double matcher_bad_match_pad
Definition: classify.h:459

◆ RemoveExtraPuncs()

void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS * Results)

This routine discards extra digits or punctuation from the results. We keep only the top 2 punctuation answers and the top 1 digit answer if present.

Parameters
Results: contains matches to be filtered

Definition at line 2093 of file adaptmatch.cpp.

2093  {
2094  int Next, NextGood;
2095  int punc_count; /*no of garbage characters */
2096  int digit_count;
2097  /*garbage characters */
2098  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2099  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2100 
2101  punc_count = 0;
2102  digit_count = 0;
2103  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2104  const UnicharRating& match = Results->match[Next];
2105  bool keep = true;
2106  if (strstr(punc_chars,
2107  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2108  if (punc_count >= 2)
2109  keep = false;
2110  punc_count++;
2111  } else {
2112  if (strstr(digit_chars,
2113  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2114  if (digit_count >= 1)
2115  keep = false;
2116  digit_count++;
2117  }
2118  }
2119  if (keep) {
2120  if (NextGood == Next) {
2121  ++NextGood;
2122  } else {
2123  Results->match[NextGood++] = match;
2124  }
2125  }
2126  }
2127  Results->match.truncate(NextGood);
2128 } /* RemoveExtraPuncs */
void truncate(int size)
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:73
int size() const
Definition: genericvector.h:72

◆ ResetAdaptiveClassifierInternal()

void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 598 of file adaptmatch.cpp.

598  {
600  tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
601  NumAdaptationsFailed);
602  }
605  if (BackupAdaptedTemplates != nullptr)
607  BackupAdaptedTemplates = nullptr;
608  NumAdaptationsFailed = 0;
609 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:151
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
int classify_learning_debug_level
Definition: classify.h:455

◆ SetAdaptiveThreshold()

void tesseract::Classify::SetAdaptiveThreshold ( float  Threshold)

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters
Threshold: threshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating

Definition at line 2141 of file adaptmatch.cpp.

2141  {
2142  Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2144  ClipToRange<int>(255 * Threshold, 0, 255));
2146  ClipToRange<int>(255 * Threshold, 0, 255));
2147 } /* SetAdaptiveThreshold */
int classify_adapt_feature_threshold
Definition: classify.h:483
int classify_adapt_proto_threshold
Definition: classify.h:481
double matcher_good_threshold
Definition: classify.h:456

◆ SetStaticClassifier()

void tesseract::Classify::SetStaticClassifier ( ShapeClassifier * static_classifier)

Definition at line 211 of file classify.cpp.

211  {
212  delete static_classifier_;
213  static_classifier_ = static_classifier;
214 }

◆ SettupPass1()

void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note
this is somewhat redundant, it simply says that if learning is enabled then it will remain enabled on the first pass. If it is disabled, then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Definition at line 652 of file adaptmatch.cpp.

652  {
654 
656 
657 } /* SettupPass1 */
bool classify_enable_learning
Definition: classify.h:429
virtual Dict & getDict()
Definition: classify.h:107
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:364

◆ SettupPass2()

void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Definition at line 669 of file adaptmatch.cpp.

669  {
670  EnableLearning = false;
672 
673 } /* SettupPass2 */
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:368
virtual Dict & getDict()
Definition: classify.h:107

◆ SetupBLCNDenorms()

void tesseract::Classify::SetupBLCNDenorms ( const TBLOB & blob,
bool  nonlinear_norm,
DENORM * bl_denorm,
DENORM * cn_denorm,
INT_FX_RESULT_STRUCT * fx_info
)
static

Definition at line 129 of file intfx.cpp.

131  {
132  // Compute 1st and 2nd moments of the original outline.
133  FCOORD center, second_moments;
134  int length = blob.ComputeMoments(&center, &second_moments);
135  if (fx_info != nullptr) {
136  fx_info->Length = length;
137  fx_info->Rx = IntCastRounded(second_moments.y());
138  fx_info->Ry = IntCastRounded(second_moments.x());
139 
140  fx_info->Xmean = IntCastRounded(center.x());
141  fx_info->Ymean = IntCastRounded(center.y());
142  }
143  // Setup the denorm for Baseline normalization.
144  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f,
145  1.0f, 1.0f, 128.0f, 128.0f);
146  // Setup the denorm for character normalization.
147  if (nonlinear_norm) {
150  TBOX box;
151  blob.GetPreciseBoundingBox(&box);
152  box.pad(1, 1);
153  blob.GetEdgeCoords(box, &x_coords, &y_coords);
154  cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX,
155  0.0f, 0.0f, x_coords, y_coords);
156  } else {
157  cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(),
158  center.x(), center.y(),
159  51.2f / second_moments.x(),
160  51.2f / second_moments.y(),
161  128.0f, 128.0f);
162  }
163 }
void GetEdgeCoords(const TBOX &box, GenericVector< GenericVector< int > > *x_coords, GenericVector< GenericVector< int > > *y_coords) const
Definition: blobs.cpp:557
Definition: points.h:188
int16_t Ymean
Definition: intfx.h:37
const DENORM & denorm() const
Definition: blobs.h:363
void pad(int xpad, int ypad)
Definition: rect.h:131
void GetPreciseBoundingBox(TBOX *precise_box) const
Definition: blobs.cpp:541
float y() const
Definition: points.h:210
float x() const
Definition: points.h:207
void SetupNonLinear(const DENORM *predecessor, const TBOX &box, float target_width, float target_height, float final_xshift, float final_yshift, const GenericVector< GenericVector< int > > &x_coords, const GenericVector< GenericVector< int > > &y_coords)
Definition: normalis.cpp:268
Definition: rect.h:34
int32_t Length
Definition: intfx.h:36
int ComputeMoments(FCOORD *center, FCOORD *second_moments) const
Definition: blobs.cpp:522
int16_t Xmean
Definition: intfx.h:37
void SetupNormalization(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift)
Definition: normalis.cpp:96
int IntCastRounded(double x)
Definition: helpers.h:175

◆ shape_table()

const ShapeTable* tesseract::Classify::shape_table ( ) const
inline

Definition at line 111 of file classify.h.

111  {
112  return shape_table_;
113  }
ShapeTable * shape_table_
Definition: classify.h:546

◆ ShapeIDToClassID()

int tesseract::Classify::ShapeIDToClassID ( int  shape_id) const

Definition at line 2220 of file adaptmatch.cpp.

2220  {
2221  for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2222  int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2223  ASSERT_HOST(font_set_id >= 0);
2224  const FontSet &fs = fontset_table_.get(font_set_id);
2225  for (int config = 0; config < fs.size; ++config) {
2226  if (fs.configs[config] == shape_id)
2227  return id;
2228  }
2229  }
2230  tprintf("Shape %d not found\n", shape_id);
2231  return -1;
2232 }
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ ShowBestMatchFor()

void tesseract::Classify::ShowBestMatchFor ( int  shape_id,
const INT_FEATURE_STRUCT * features,
int  num_features 
)

This routine displays debug information for the best config of the given shape_id for the given set of features.

Parameters
shape_id: classifier id to work with
features: features of the unknown character
num_features: number of features in the features array

Definition at line 2159 of file adaptmatch.cpp.

2161  {
2162 #ifndef GRAPHICS_DISABLED
2163  uint32_t config_mask;
2164  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2165  tprintf("No built-in templates for class/shape %d\n", shape_id);
2166  return;
2167  }
2168  if (num_features <= 0) {
2169  tprintf("Illegal blob (char norm features)!\n");
2170  return;
2171  }
2172  UnicharRating cn_result;
2173  classify_norm_method.set_value(character);
2176  num_features, features, &cn_result,
2179  tprintf("\n");
2180  config_mask = 1 << cn_result.config;
2181 
2182  tprintf("Static Shape ID: %d\n", shape_id);
2183  ShowMatchDisplay();
2185  &config_mask, num_features, features, &cn_result,
2189 #endif // GRAPHICS_DISABLED
2190 } /* ShowBestMatchFor */
#define ClassForClassId(T, c)
Definition: intproto.h:178
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int classify_adapt_feature_threshold
Definition: classify.h:483
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:511
IntegerMatcher im_
Definition: classify.h:540
void UpdateMatchDisplay()
Definition: intproto.cpp:447
BIT_VECTOR AllProtosOn
Definition: classify.h:522
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
#define UnusedClassIdIn(T, c)
Definition: intproto.h:177
bool matcher_debug_separate_windows
Definition: classify.h:494
#define NO_DEBUG
Definition: adaptmatch.cpp:79
BIT_VECTOR AllConfigsOn
Definition: classify.h:523

◆ ShowMatchDisplay()

void tesseract::Classify::ShowMatchDisplay ( )

This routine sends the shapes in the global display lists to the match debugger window.

Globals:

  • FeatureShapes display list containing feature matches
  • ProtoShapes display list containing proto matches

Definition at line 962 of file intproto.cpp.

962  {
964  if (ProtoDisplayWindow) {
965  ProtoDisplayWindow->Clear();
966  }
967  if (FeatureDisplayWindow) {
968  FeatureDisplayWindow->Clear();
969  }
971  static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
972  IntMatchWindow);
973  IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
975  if (ProtoDisplayWindow) {
976  ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
978  }
979  if (FeatureDisplayWindow) {
980  FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
982  }
983 } /* ShowMatchDisplay */
#define INT_MIN_Y
Definition: intproto.cpp:60
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757
void InitIntMatchWindowIfReqd()
Definition: intproto.cpp:1722
#define INT_MAX_Y
Definition: intproto.cpp:62
#define INT_MIN_X
Definition: intproto.cpp:59
void Clear()
Definition: scrollview.cpp:589
#define INT_MAX_X
Definition: intproto.cpp:61

◆ StartBackupAdaptiveClassifier()

void tesseract::Classify::StartBackupAdaptiveClassifier ( )

Definition at line 629 of file adaptmatch.cpp.

629  {
630  if (BackupAdaptedTemplates != nullptr)
633 }
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:151

◆ SwitchAdaptiveClassifier()

void tesseract::Classify::SwitchAdaptiveClassifier ( )

Definition at line 613 of file adaptmatch.cpp.

613  {
614  if (BackupAdaptedTemplates == nullptr) {
616  return;
617  }
619  tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
620  NumAdaptationsFailed);
621  }
624  BackupAdaptedTemplates = nullptr;
625  NumAdaptationsFailed = 0;
626 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
int classify_learning_debug_level
Definition: classify.h:455

◆ TempConfigReliable()

bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG & config
)

Definition at line 2236 of file adaptmatch.cpp.

2237  {
2238  if (classify_learning_debug_level >= 1) {
2239  tprintf("NumTimesSeen for config of %s is %d\n",
2240  getDict().getUnicharset().debug_str(class_id).string(),
2241  config->NumTimesSeen);
2242  }
2244  return true;
2245  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2246  return false;
2247  } else if (use_ambigs_for_adaption) {
2248  // Go through the ambigs vector and see whether we have already seen
2249  // enough times all the characters represented by the ambigs vector.
2250  const UnicharIdVector *ambigs =
2252  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2253  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2254  ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2255  assert(ambig_class != nullptr);
2256  if (ambig_class->NumPermConfigs == 0 &&
2257  ambig_class->MaxNumTimesSeen <
2259  if (classify_learning_debug_level >= 1) {
2260  tprintf("Ambig %s has not been seen enough times,"
2261  " not making config for %s permanent\n",
2262  getDict().getUnicharset().debug_str(
2263  (*ambigs)[ambig]).string(),
2264  getDict().getUnicharset().debug_str(class_id).string());
2265  }
2266  return false;
2267  }
2268  }
2269  }
2270  return true;
2271 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:35
uint8_t NumTimesSeen
Definition: adaptive.h:36
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
uint8_t MaxNumTimesSeen
Definition: adaptive.h:57
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
virtual Dict & getDict()
Definition: classify.h:107
bool use_ambigs_for_adaption
Definition: ccutil.h:89
int classify_learning_debug_level
Definition: classify.h:455
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
uint8_t NumPermConfigs
Definition: adaptive.h:56
int size() const
Definition: genericvector.h:72
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:183
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:466
int matcher_min_examples_for_prototyping
Definition: classify.h:464

◆ UpdateAmbigsGroup()

void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
TBLOB * Blob
)

Definition at line 2273 of file adaptmatch.cpp.

2273  {
2274  const UnicharIdVector *ambigs =
2276  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2277  if (classify_learning_debug_level >= 1) {
2278  tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2279  getDict().getUnicharset().debug_str(class_id).string(), class_id);
2280  }
2281  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2282  CLASS_ID ambig_class_id = (*ambigs)[ambig];
2283  const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2284  for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2285  if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2286  const TEMP_CONFIG config =
2287  TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2288  if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2289  if (classify_learning_debug_level >= 1) {
2290  tprintf("Making config %d of %s permanent\n", cfg,
2291  getDict().getUnicharset().debug_str(
2292  ambig_class_id).string());
2293  }
2294  MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2295  }
2296  }
2297  }
2298 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:35
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:82
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
virtual Dict & getDict()
Definition: classify.h:107
int classify_learning_debug_level
Definition: classify.h:455
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
int size() const
Definition: genericvector.h:72
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:192

◆ WriteAdaptedTemplates()

void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine saves Templates to File in a binary format.

Parameters
File: open text file to write Templates to
Templates: set of adapted templates to write to File
Note
Globals: none

Definition at line 453 of file adaptive.cpp.

453  {
454  int i;
455 
456  /* first write the high level adaptive template struct */
457  fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
458 
459  /* then write out the basic integer templates */
460  WriteIntTemplates (File, Templates->Templates, unicharset);
461 
462  /* then write out the adaptive info for each class */
463  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
464  WriteAdaptedClass (File, Templates->Class[i],
465  Templates->Templates->Class[i]->NumConfigs);
466  }
467 } /* WriteAdaptedTemplates */
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1017
INT_TEMPLATES Templates
Definition: adaptive.h:67
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs)
Definition: adaptive.cpp:409
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
uint8_t NumConfigs
Definition: intproto.h:108
UNICHARSET unicharset
Definition: ccutil.h:73

◆ WriteIntTemplates()

void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES  Templates,
const UNICHARSET &  target_unicharset 
)

This routine writes Templates to File. The format is an efficient binary format. File must already be open for writing.

Parameters
File: open file to write templates to
Templates: templates to save into File
target_unicharset: the UNICHARSET to use

Definition at line 1017 of file intproto.cpp.

1018  {
1019  int i, j;
1020  INT_CLASS Class;
1021  int unicharset_size = target_unicharset.size();
1022  int version_id = -5; // When negated by the reader -1 becomes +1 etc.
1023 
1024  if (Templates->NumClasses != unicharset_size) {
1025  cprintf("Warning: executing WriteIntTemplates() with %d classes in"
1026  " Templates, while target_unicharset size is %d\n",
1027  Templates->NumClasses, unicharset_size);
1028  }
1029 
1030  /* first write the high level template struct */
1031  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
1032  fwrite(&version_id, sizeof(version_id), 1, File);
1033  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
1034  1, File);
1035  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
1036 
1037  /* then write out the class pruners */
1038  for (i = 0; i < Templates->NumClassPruners; i++)
1039  fwrite(Templates->ClassPruners[i],
1040  sizeof(CLASS_PRUNER_STRUCT), 1, File);
1041 
1042  /* then write out each class */
1043  for (i = 0; i < Templates->NumClasses; i++) {
1044  Class = Templates->Class[i];
1045 
1046  /* first write out the high level struct for the class */
1047  fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
1048  fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
1049  ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
1050  fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
1051  for (j = 0; j < Class->NumConfigs; ++j) {
1052  fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
1053  }
1054 
1055  /* then write out the proto lengths */
1056  if (MaxNumIntProtosIn (Class) > 0) {
1057  fwrite(Class->ProtoLengths, sizeof(uint8_t),
1058  MaxNumIntProtosIn(Class), File);
1059  }
1060 
1061  /* then write out the proto sets */
1062  for (j = 0; j < Class->NumProtoSets; j++)
1063  fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
1064 
1065  /* then write the fonts info */
1066  fwrite(&Class->font_set_id, sizeof(int), 1, File);
1067  }
1068 
1069  /* Write the fonts info tables */
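1070  this->fontinfo_table_.write(File, NewPermanentTessCallback(write_info));
1071  this->fontinfo_table_.write(File,
1072  NewPermanentTessCallback(write_spacing_info));
1073  this->fontset_table_.write(File, NewPermanentTessCallback(write_set));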
1074 } /* WriteIntTemplates */
uint16_t NumProtos
Definition: intproto.h:106
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
int size() const
Definition: unicharset.h:341
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:198
uint8_t * ProtoLengths
Definition: intproto.h:110
#define MaxNumIntProtosIn(C)
Definition: intproto.h:165
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:163
PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS]
Definition: intproto.h:109
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
uint8_t NumConfigs
Definition: intproto.h:108
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:232
#define ASSERT_HOST(x)
Definition: errcode.h:88
uint8_t NumProtoSets
Definition: intproto.h:107

◆ WriteTRFile()

bool tesseract::Classify::WriteTRFile ( const STRING filename)

Definition at line 98 of file blobclass.cpp.

98  {
99  bool result = false;
100  STRING tr_filename = filename + ".tr";
101  FILE* fp = fopen(tr_filename.string(), "wb");
102  if (fp) {
103  result =
104  tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
105  fclose(fp);
106  }
107  tr_file_data_.truncate_at(0);
108  return result;
109 }
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
const char * string() const
Definition: strngs.cpp:194
int32_t length() const
Definition: strngs.cpp:189
void truncate_at(int32_t index)
Definition: strngs.cpp:265
STRING
Definition: strngs.h:45

Member Data Documentation

◆ AdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::AdaptedTemplates

Definition at line 515 of file classify.h.

◆ AllConfigsOff

BIT_VECTOR tesseract::Classify::AllConfigsOff

Definition at line 524 of file classify.h.

◆ AllConfigsOn

BIT_VECTOR tesseract::Classify::AllConfigsOn

Definition at line 523 of file classify.h.

◆ allow_blob_division

bool tesseract::Classify::allow_blob_division = true

"Use divisible blobs chopping"

Definition at line 423 of file classify.h.

◆ AllProtosOn

BIT_VECTOR tesseract::Classify::AllProtosOn

Definition at line 522 of file classify.h.

◆ BackupAdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::BackupAdaptedTemplates

Definition at line 519 of file classify.h.

◆ certainty_scale

double tesseract::Classify::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 473 of file classify.h.

◆ classify_adapt_feature_threshold

int tesseract::Classify::classify_adapt_feature_threshold = 230

"Threshold for good features during adaptive 0-255"

Definition at line 483 of file classify.h.

◆ classify_adapt_proto_threshold

int tesseract::Classify::classify_adapt_proto_threshold = 230

"Threshold for good protos during adaptive 0-255"

Definition at line 481 of file classify.h.

◆ classify_adapted_pruning_factor

double tesseract::Classify::classify_adapted_pruning_factor = 2.5

"Prune poor adapted results this much worse than best result"

Definition at line 477 of file classify.h.

◆ classify_adapted_pruning_threshold

double tesseract::Classify::classify_adapted_pruning_threshold = -1.0

"Threshold at which classify_adapted_pruning_factor starts"

Definition at line 479 of file classify.h.

◆ classify_bln_numeric_mode

bool tesseract::Classify::classify_bln_numeric_mode = 0

"Assume the input is numbers [0-9]."

Definition at line 508 of file classify.h.

◆ classify_char_norm_range

double tesseract::Classify::classify_char_norm_range = 0.2

"Character Normalization Range ..."

Definition at line 436 of file classify.h.

◆ classify_character_fragments_garbage_certainty_threshold

double tesseract::Classify::classify_character_fragments_garbage_certainty_threshold = -3.0

"Exclude fragments that do not match any whole character" " with at least this certainty"

Definition at line 489 of file classify.h.

◆ classify_class_pruner_multiplier

int tesseract::Classify::classify_class_pruner_multiplier = 15

"Class Pruner Multiplier 0-255: "

Definition at line 501 of file classify.h.

◆ classify_class_pruner_threshold

int tesseract::Classify::classify_class_pruner_threshold = 229

"Class Pruner Threshold 0-255"

Definition at line 499 of file classify.h.

◆ classify_cp_cutoff_strength

int tesseract::Classify::classify_cp_cutoff_strength = 7

"Class Pruner CutoffStrength: "

Definition at line 503 of file classify.h.

◆ classify_debug_character_fragments

bool tesseract::Classify::classify_debug_character_fragments = false

"Bring up graphical debugging windows for fragments training"

Definition at line 491 of file classify.h.

◆ classify_debug_level

int tesseract::Classify::classify_debug_level = 0

"Classify debug level"

Definition at line 430 of file classify.h.

◆ classify_enable_adaptive_debugger

bool tesseract::Classify::classify_enable_adaptive_debugger = 0

"Enable match debugger"

Definition at line 450 of file classify.h.

◆ classify_enable_adaptive_matcher

bool tesseract::Classify::classify_enable_adaptive_matcher = 1

"Enable adaptive classifier"

Definition at line 445 of file classify.h.

◆ classify_enable_learning

bool tesseract::Classify::classify_enable_learning = true

"Enable adaptive classifier"

Definition at line 429 of file classify.h.

◆ classify_integer_matcher_multiplier

int tesseract::Classify::classify_integer_matcher_multiplier = 10

"Integer Matcher Multiplier 0-255: "

Definition at line 505 of file classify.h.

◆ classify_learn_debug_str

char* tesseract::Classify::classify_learn_debug_str = ""

"Class str to debug learning"

Definition at line 495 of file classify.h.

◆ classify_learning_debug_level

int tesseract::Classify::classify_learning_debug_level = 0

"Learning Debug Level: "

Definition at line 455 of file classify.h.

◆ classify_max_certainty_margin

double tesseract::Classify::classify_max_certainty_margin = 5.5

"Veto difference between classifier certainties"

Definition at line 440 of file classify.h.

◆ classify_max_rating_ratio

double tesseract::Classify::classify_max_rating_ratio = 1.5

"Veto ratio between classifier ratings"

Definition at line 438 of file classify.h.

◆ classify_misfit_junk_penalty

double tesseract::Classify::classify_misfit_junk_penalty = 0.0

"Penalty to apply when a non-alnum is vertically out of " "its expected textline position"

Definition at line 471 of file classify.h.

◆ classify_nonlinear_norm

bool tesseract::Classify::classify_nonlinear_norm = 0

"Non-linear stroke-density normalization"

Definition at line 452 of file classify.h.

◆ classify_norm_method

int tesseract::Classify::classify_norm_method = character

"Normalization Method ..."

Definition at line 434 of file classify.h.

◆ classify_save_adapted_templates

bool tesseract::Classify::classify_save_adapted_templates = 0

"Save adapted templates to a file"

Definition at line 449 of file classify.h.

◆ classify_use_pre_adapted_templates

bool tesseract::Classify::classify_use_pre_adapted_templates = 0

"Use pre-adapted classifier templates"

Definition at line 447 of file classify.h.

◆ disable_character_fragments

bool tesseract::Classify::disable_character_fragments = true

"Do not include character fragments in the" " results of the classifier"

Definition at line 486 of file classify.h.

◆ EnableLearning

bool tesseract::Classify::EnableLearning

Definition at line 577 of file classify.h.

◆ feature_defs_

FEATURE_DEFS_STRUCT tesseract::Classify::feature_defs_
protected

Definition at line 541 of file classify.h.

◆ fontinfo_table_

UnicityTable<FontInfo> tesseract::Classify::fontinfo_table_

Definition at line 529 of file classify.h.

◆ fontset_table_

UnicityTable<FontSet> tesseract::Classify::fontset_table_

Definition at line 537 of file classify.h.

◆ im_

IntegerMatcher tesseract::Classify::im_
protected

Definition at line 540 of file classify.h.

◆ matcher_avg_noise_size

double tesseract::Classify::matcher_avg_noise_size = 12.0

"Avg. noise blob length: "

Definition at line 461 of file classify.h.

◆ matcher_bad_match_pad

double tesseract::Classify::matcher_bad_match_pad = 0.15

"Bad Match Pad (0-1)"

Definition at line 459 of file classify.h.

◆ matcher_clustering_max_angle_delta

double tesseract::Classify::matcher_clustering_max_angle_delta = 0.015

"Maximum angle delta for prototype clustering"

Definition at line 468 of file classify.h.

◆ matcher_debug_flags

int tesseract::Classify::matcher_debug_flags = 0

"Matcher Debug Flags"

Definition at line 454 of file classify.h.

◆ matcher_debug_level

int tesseract::Classify::matcher_debug_level = 0

"Matcher Debug Level"

Definition at line 453 of file classify.h.

◆ matcher_debug_separate_windows

bool tesseract::Classify::matcher_debug_separate_windows = false

"Use two different windows for debugging the matching: " "One for the protos and one for the features."

Definition at line 494 of file classify.h.

◆ matcher_good_threshold

double tesseract::Classify::matcher_good_threshold = 0.125

"Good Match (0-1)"

Definition at line 456 of file classify.h.

◆ matcher_min_examples_for_prototyping

int tesseract::Classify::matcher_min_examples_for_prototyping = 3

"Reliable Config Threshold"

Definition at line 464 of file classify.h.

◆ matcher_perfect_threshold

double tesseract::Classify::matcher_perfect_threshold = 0.02

"Perfect Match (0-1)"

Definition at line 458 of file classify.h.

◆ matcher_permanent_classes_min

int tesseract::Classify::matcher_permanent_classes_min = 1

"Min # of permanent classes"

Definition at line 462 of file classify.h.

◆ matcher_rating_margin

double tesseract::Classify::matcher_rating_margin = 0.1

"New template margin (0-1)"

Definition at line 460 of file classify.h.

◆ matcher_reliable_adaptive_result

double tesseract::Classify::matcher_reliable_adaptive_result = 0.0

"Great Match (0-1)"

Definition at line 457 of file classify.h.

◆ matcher_sufficient_examples_for_prototyping

int tesseract::Classify::matcher_sufficient_examples_for_prototyping = 5

"Enable adaption even if the ambiguities have not been seen"

Definition at line 466 of file classify.h.

◆ NormProtos

NORM_PROTOS* tesseract::Classify::NormProtos

Definition at line 527 of file classify.h.

◆ PreTrainedTemplates

INT_TEMPLATES tesseract::Classify::PreTrainedTemplates

Definition at line 511 of file classify.h.

◆ prioritize_division

bool tesseract::Classify::prioritize_division = false

"Prioritize blob division over chopping"

Definition at line 428 of file classify.h.

◆ rating_scale

double tesseract::Classify::rating_scale = 1.5

"Rating scaling factor"

Definition at line 472 of file classify.h.

◆ shape_table_

ShapeTable* tesseract::Classify::shape_table_
protected

Definition at line 546 of file classify.h.

◆ speckle_large_max_size

double tesseract::Classify::speckle_large_max_size = 0.30

"Max large speckle size"

Definition at line 509 of file classify.h.

◆ speckle_rating_penalty

double tesseract::Classify::speckle_rating_penalty = 10.0

"Penalty to add to worst rating for noise"

Definition at line 511 of file classify.h.

◆ TempProtoMask

BIT_VECTOR tesseract::Classify::TempProtoMask

Definition at line 525 of file classify.h.

◆ tess_bn_matching

bool tesseract::Classify::tess_bn_matching = 0

"Baseline Normalized Matching"

Definition at line 444 of file classify.h.

◆ tess_cn_matching

bool tesseract::Classify::tess_cn_matching = 0

"Character Normalized Matching"

Definition at line 443 of file classify.h.

◆ tessedit_class_miss_scale

double tesseract::Classify::tessedit_class_miss_scale = 0.00390625

"Scale factor for features not used"

Definition at line 475 of file classify.h.


The documentation for this class was generated from the following files: