tesseract  3.04.00
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_cost)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs * dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector * very_beginning_active_dawgs_
 
DawgPositionVector * beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Detailed Description

Definition at line 42 of file language_model.h.

Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 45 of file language_model.cpp.

47  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
48  dict->getCCUtil()->params()),
50  "Turn on/off the use of character ngram model",
51  dict->getCCUtil()->params()),
53  "Maximum order of the character ngram model",
54  dict->getCCUtil()->params()),
56  "Maximum number of prunable (those for which"
57  " PrunablePath() is true) entries in each viterbi list"
58  " recorded in BLOB_CHOICEs",
59  dict->getCCUtil()->params()),
61  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
62  dict->getCCUtil()->params()),
64  "To avoid overly small denominators use this as the "
65  "floor of the probability returned by the ngram model.",
66  dict->getCCUtil()->params()),
68  "Average classifier score of a non-matching unichar.",
69  dict->getCCUtil()->params()),
71  "Use only the first UTF8 step of the given string"
72  " when computing log probabilities.",
73  dict->getCCUtil()->params()),
75  "Strength of the character ngram model relative to the"
76  " character classifier ",
77  dict->getCCUtil()->params()),
79  "Factor to bring log-probs into the same range as ratings"
80  " when multiplied by outline length ",
81  dict->getCCUtil()->params()),
83  "Words are delimited by space",
84  dict->getCCUtil()->params()),
86  "Minimum length of compound words",
87  dict->getCCUtil()->params()),
89  "Penalty for words not in the frequent word dictionary",
90  dict->getCCUtil()->params()),
92  "Penalty for non-dictionary words",
93  dict->getCCUtil()->params()),
95  "Penalty for inconsistent punctuation",
96  dict->getCCUtil()->params()),
98  "Penalty for inconsistent case",
99  dict->getCCUtil()->params()),
101  "Penalty for inconsistent script",
102  dict->getCCUtil()->params()),
104  "Penalty for inconsistent character type",
105  dict->getCCUtil()->params()),
106  // TODO(daria, rays): enable font consistency checking
107  // after improving font analysis.
109  "Penalty for inconsistent font",
110  dict->getCCUtil()->params()),
112  "Penalty for inconsistent spacing",
113  dict->getCCUtil()->params()),
115  "Penalty increment",
116  dict->getCCUtil()->params()),
117  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
118  dict->getCCUtil()->params()),
120  "Use sigmoidal score for certainty",
121  dict->getCCUtil()->params()),
122  fontinfo_table_(fontinfo_table), dict_(dict),
123  fixed_pitch_(false), max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
126  dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);
127  very_beginning_active_dawgs_ = new DawgPositionVector();
128  beginning_active_dawgs_ = new DawgPositionVector();
129 }
DawgPositionVector * beginning_active_dawgs_
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
#define NULL
Definition: host.h:144
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
bool language_model_ngram_use_only_first_uft8_step
int language_model_viterbi_list_max_num_prunable
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
DawgPositionVector * very_beginning_active_dawgs_
bool language_model_ngram_space_delimited_language
double language_model_ngram_nonmatch_score
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
const UnicityTable< FontInfo > * fontinfo_table_
double language_model_penalty_non_freq_dict_word
double language_model_penalty_non_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
tesseract::LanguageModel::~LanguageModel ( )

Definition at line 131 of file language_model.cpp.

131  {
134  delete dawg_args_->updated_dawgs;
135  delete dawg_args_;
136 }
DawgPositionVector * beginning_active_dawgs_
DawgPositionVector * updated_dawgs
Definition: dict.h:82
DawgPositionVector * very_beginning_active_dawgs_

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 95 of file language_model.h.

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 301 of file language_model.h.

301  {
302  return (vse.dawg_info != NULL || vse.Consistent() ||
303  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
304  }
#define NULL
Definition: host.h:144
bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 548 of file language_model.cpp.

559  {
560  ViterbiStateEntry_IT vit;
561  if (language_model_debug_level > 1) {
562  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
563  " certainty=%.4f top_choice_flags=0x%x",
565  b->rating(), b->certainty(), top_choice_flags);
567  tprintf(" parent_vse=%p\n", parent_vse);
568  else
569  tprintf("\n");
570  }
571  // Check whether the list is full.
572  if (curr_state != NULL &&
573  curr_state->viterbi_state_entries_length >=
575  if (language_model_debug_level > 1) {
576  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
577  }
578  return false;
579  }
580 
581  // Invoke Dawg language model component.
582  LanguageModelDawgInfo *dawg_info =
583  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
584 
585  float outline_length =
587  // Invoke Ngram language model component.
588  LanguageModelNgramInfo *ngram_info = NULL;
590  ngram_info = GenerateNgramInfo(
592  denom, curr_col, curr_row, outline_length, parent_vse);
593  ASSERT_HOST(ngram_info != NULL);
594  }
595  bool liked_by_language_model = dawg_info != NULL ||
596  (ngram_info != NULL && !ngram_info->pruned);
597  // Quick escape if not liked by the language model, can't be consistent
598  // xheight, and not top choice.
599  if (!liked_by_language_model && top_choice_flags == 0) {
600  if (language_model_debug_level > 1) {
601  tprintf("Language model components very early pruned this entry\n");
602  }
603  delete ngram_info;
604  delete dawg_info;
605  return false;
606  }
607 
608  // Check consistency of the path and set the relevant consistency_info.
609  LMConsistencyInfo consistency_info(
610  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
611  // Start with just the x-height consistency, as it provides significant
612  // pruning opportunity.
613  consistency_info.ComputeXheightConsistency(
615  // Turn off xheight consistent flag if not consistent.
616  if (consistency_info.InconsistentXHeight()) {
617  top_choice_flags &= ~kXhtConsistentFlag;
618  }
619 
620  // Quick escape if not liked by the language model, not consistent xheight,
621  // and not top choice.
622  if (!liked_by_language_model && top_choice_flags == 0) {
623  if (language_model_debug_level > 1) {
624  tprintf("Language model components early pruned this entry\n");
625  }
626  delete ngram_info;
627  delete dawg_info;
628  return false;
629  }
630 
631  // Compute the rest of the consistency info.
632  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
633  word_res, &consistency_info);
634  if (dawg_info != NULL && consistency_info.invalid_punc) {
635  consistency_info.invalid_punc = false; // do not penalize dict words
636  }
637 
638  // Compute cost of associating the blobs that represent the current unichar.
639  AssociateStats associate_stats;
640  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
641  parent_vse, word_res, &associate_stats);
642  if (parent_vse != NULL) {
643  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
644  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
645  }
646 
647  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
648  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
649  parent_vse, b, 0.0, outline_length,
650  consistency_info, associate_stats, top_choice_flags, dawg_info,
651  ngram_info, (language_model_debug_level > 0) ?
653  new_vse->cost = ComputeAdjustedPathCost(new_vse);
655  tprintf("Adjusted cost = %g\n", new_vse->cost);
656 
657  // Invoke Top Choice language model component to make the final adjustments
658  // to new_vse->top_choice_flags.
659  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
660  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
661  }
662 
663  // If language model components did not like this unichar - return.
664  bool keep = new_vse->top_choice_flags || liked_by_language_model;
665  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
666  consistency_info.inconsistent_script) { // with inconsistent script
667  keep = false;
668  }
669  if (!keep) {
670  if (language_model_debug_level > 1) {
671  tprintf("Language model components did not like this entry\n");
672  }
673  delete new_vse;
674  return false;
675  }
676 
677  // Discard this entry if it represents a prunable path and
678  // language_model_viterbi_list_max_num_prunable such entries with a lower
679  // cost have already been recorded.
680  if (PrunablePath(*new_vse) &&
681  (curr_state->viterbi_state_entries_prunable_length >=
683  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
684  if (language_model_debug_level > 1) {
685  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
686  new_vse->cost,
687  curr_state->viterbi_state_entries_prunable_max_cost);
688  }
689  delete new_vse;
690  return false;
691  }
692 
693  // Update best choice if needed.
694  if (word_end) {
695  UpdateBestChoice(new_vse, pain_points, word_res,
696  best_choice_bundle, blamer_bundle);
697  // Discard the entry if UpdateBestChoice() found flaws in it.
698  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
699  new_vse != best_choice_bundle->best_vse) {
700  if (language_model_debug_level > 1) {
701  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
702  }
703  delete new_vse;
704  return false;
705  }
706  }
707 
 708  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
709  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
710  false, new_vse);
711  curr_state->viterbi_state_entries_length++;
712  if (PrunablePath(*new_vse)) {
713  curr_state->viterbi_state_entries_prunable_length++;
714  }
715 
716  // Update lms->viterbi_state_entries_prunable_max_cost and clear
 717  // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
718  if ((curr_state->viterbi_state_entries_prunable_length >=
720  new_vse->top_choice_flags) {
721  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
722  int prunable_counter = language_model_viterbi_list_max_num_prunable;
723  vit.set_to_list(&(curr_state->viterbi_state_entries));
724  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
725  ViterbiStateEntry *curr_vse = vit.data();
726  // Clear the appropriate top choice flags of the entries in the
 727  // list that have cost higher than new_entry->cost
728  // (since they will not be top choices any more).
729  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
730  curr_vse->cost > new_vse->cost) {
731  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
732  }
733  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
734  // Update curr_state->viterbi_state_entries_prunable_max_cost.
735  if (prunable_counter == 0) {
736  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
737  if (language_model_debug_level > 1) {
738  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
739  curr_state->viterbi_state_entries_prunable_max_cost);
740  }
741  prunable_counter = -1; // stop counting
742  }
743  }
744  }
745 
746  // Print the newly created ViterbiStateEntry.
747  if (language_model_debug_level > 2) {
748  new_vse->Print("New");
750  curr_state->Print("Updated viterbi list");
751  }
752 
753  return true;
754 }
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
#define tprintf(...)
Definition: tprintf.h:31
static const LanguageModelFlagsType kXhtConsistentFlag
#define NULL
Definition: host.h:144
bool PrunablePath(const ViterbiStateEntry &vse)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:127
static const float kBadRating
Definition: ratngs.h:273
int language_model_viterbi_list_max_num_prunable
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
float rating() const
Definition: ratngs.h:79
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
static const LanguageModelFlagsType kSmallestRatingFlag
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
#define ASSERT_HOST(x)
Definition: errcode.h:84
float certainty() const
Definition: ratngs.h:82
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:82
float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 104 of file language_model.h.

104  {
106  // cert is assumed to be between 0 and -dict_->certainty_scale.
107  // If you enable language_model_use_sigmoidal_certainty, you
108  // need to adjust language_model_ngram_nonmatch_score as well.
109  cert = -cert / dict_->certainty_scale;
110  return 1.0f / (1.0f + exp(10.0f * cert));
111  } else {
112  return (-1.0f / cert);
113  }
114  }
double certainty_scale
Definition: dict.h:601
float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1183 of file language_model.cpp.

1183  {
1184  ASSERT_HOST(vse != NULL);
1185  if (params_model_.Initialized()) {
1186  float features[PTRAIN_NUM_FEATURE_TYPES];
1187  ExtractFeaturesFromPath(*vse, features);
1188  float cost = params_model_.ComputeCost(features);
1189  if (language_model_debug_level > 3) {
1190  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1191  if (language_model_debug_level >= 5) {
1192  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1193  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1194  }
1195  }
1196  }
1197  return cost * vse->outline_length;
1198  } else {
1199  float adjustment = 1.0f;
1200  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1202  }
1203  if (vse->dawg_info == NULL) {
1205  if (vse->length > language_model_min_compound_length) {
1206  adjustment += ((vse->length - language_model_min_compound_length) *
1208  }
1209  }
1210  if (vse->associate_stats.shape_cost > 0) {
1211  adjustment += vse->associate_stats.shape_cost /
1212  static_cast<float>(vse->length);
1213  }
1215  ASSERT_HOST(vse->ngram_info != NULL);
1216  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1217  } else {
1218  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1219  vse->consistency_info);
1220  return vse->ratings_sum * adjustment;
1221  }
1222  }
1223 }
#define tprintf(...)
Definition: tprintf.h:31
#define NULL
Definition: host.h:144
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
float ComputeCost(const float features[]) const
double language_model_penalty_non_freq_dict_word
double language_model_penalty_non_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 116 of file language_model.h.

116  {
117  if (num_problems == 0) return 0.0f;
118  if (num_problems == 1) return penalty;
119  return (penalty + (language_model_penalty_increment *
120  static_cast<float>(num_problems-1)));
121  }
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 272 of file language_model.h.

276  {
278  col, row,
279  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
280  (parent_vse != NULL) ? parent_vse->length : 0,
281  fixed_pitch_, max_char_wh_ratio,
282  word_res, language_model_debug_level > 2, associate_stats);
283  }
#define NULL
Definition: host.h:144
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:37
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 127 of file language_model.h.

129  {
130  if (dawg_info != NULL) {
131  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
133  (consistency_info.inconsistent_script ?
135  }
136  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
138  ComputeAdjustment(consistency_info.NumInconsistentCase(),
140  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
142  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
144  (consistency_info.inconsistent_script ?
146  (consistency_info.inconsistent_font ?
148  }
#define NULL
Definition: host.h:144
float ComputeAdjustment(int num_problems, float penalty)
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 980 of file language_model.cpp.

980  {
981  if (curr_list->empty()) return 1.0f;
982  float denom = 0.0f;
983  int len = 0;
984  BLOB_CHOICE_IT c_it(curr_list);
985  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
986  ASSERT_HOST(c_it.data() != NULL);
987  ++len;
988  denom += CertaintyScore(c_it.data()->certainty());
989  }
990  assert(len != 0);
991  // The ideal situation would be to have the classifier scores for
992  // classifying each position as each of the characters in the unicharset.
993  // Since we can not do this because of speed, we add a very crude estimate
994  // of what these scores for the "missing" classifications would sum up to.
995  denom += (dict_->getUnicharset().size() - len) *
997 
998  return denom;
999 }
float CertaintyScore(float cert)
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
double language_model_ngram_nonmatch_score
int size() const
Definition: unicharset.h:297
#define ASSERT_HOST(x)
Definition: errcode.h:84
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_cost 
)
protected

Definition at line 920 of file language_model.cpp.

926  {
927  const char *context_ptr = context;
928  char *modified_context = NULL;
929  char *modified_context_end = NULL;
930  const char *unichar_ptr = unichar;
931  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
932  float prob = 0.0f;
933  int step = 0;
934  while (unichar_ptr < unichar_end &&
935  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
936  if (language_model_debug_level > 1) {
937  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
938  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
939  }
940  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
941  ++(*unichar_step_len);
943  unichar_ptr += step;
944  // If there are multiple UTF8 characters present in unichar, context is
945  // updated to include the previously examined characters from str,
946  // unless use_only_first_uft8_step is true.
947  if (unichar_ptr < unichar_end) {
948  if (modified_context == NULL) {
949  int context_len = strlen(context);
950  modified_context =
951  new char[context_len + strlen(unichar_ptr) + step + 1];
952  strncpy(modified_context, context, context_len);
953  modified_context_end = modified_context + context_len;
954  context_ptr = modified_context;
955  }
956  strncpy(modified_context_end, unichar_ptr - step, step);
957  modified_context_end += step;
958  *modified_context_end = '\0';
959  }
960  }
961  prob /= static_cast<float>(*unichar_step_len); // normalize
962  if (prob < language_model_ngram_small_prob) {
963  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
964  *found_small_prob = true;
966  }
967  *ngram_cost = -1.0*log2(prob);
968  float ngram_and_classifier_cost =
969  -1.0*log2(CertaintyScore(certainty)/denom) +
970  *ngram_cost * language_model_ngram_scale_factor;
971  if (language_model_debug_level > 1) {
972  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
973  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
974  ngram_and_classifier_cost);
975  }
976  if (modified_context != NULL) delete[] modified_context;
977  return ngram_and_classifier_cost;
978 }
float CertaintyScore(float cert)
#define tprintf(...)
Definition: tprintf.h:31
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define NULL
Definition: host.h:144
bool language_model_ngram_use_only_first_uft8_step
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1374 of file language_model.cpp.

1379  {
1380  if (truth_path != NULL) {
1381  *truth_path =
1382  (blamer_bundle != NULL &&
1383  vse->length == blamer_bundle->correct_segmentation_length());
1384  }
1385  BLOB_CHOICE *curr_b = vse->curr_b;
1386  ViterbiStateEntry *curr_vse = vse;
1387 
1388  int i;
1389  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1390 
1391  // Re-compute the variance of the width-to-height ratios (since we now
1392  // can compute the mean over the whole word).
1393  float full_wh_ratio_mean = 0.0f;
1394  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1395  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1396  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1397  static_cast<float>(vse->length));
1398  vse->associate_stats.full_wh_ratio_var = 0.0f;
1399  }
1400 
1401  // Construct a WERD_CHOICE by tracing parent pointers.
1402  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1403  word->set_length(vse->length);
1404  int total_blobs = 0;
1405  for (i = (vse->length-1); i >= 0; --i) {
1406  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
1407  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1408  *truth_path = false;
1409  }
1410  // The number of blobs used for this choice is row - col + 1.
1411  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1412  total_blobs += num_blobs;
1413  word->set_blob_choice(i, num_blobs, curr_b);
1414  // Update the width-to-height ratio variance. Useful non-space delimited
1415  // languages to ensure that the blobs are of uniform width.
1416  // Skip leading and trailing punctuation when computing the variance.
1417  if ((full_wh_ratio_mean != 0.0f &&
1418  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1419  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1420  vse->associate_stats.full_wh_ratio_var +=
1421  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1422  if (language_model_debug_level > 2) {
1423  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1424  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1425  }
1426  }
1427 
1428  // Mark the word as compound if compound permuter was set for any of
1429  // the unichars on the path (usually this will happen for unichars
1430  // that are compounding operators, like "-" and "/").
1431  if (!compound && curr_vse->dawg_info &&
1432  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1433 
1434  // Update curr_* pointers.
1435  curr_vse = curr_vse->parent_vse;
1436  if (curr_vse == NULL) break;
1437  curr_b = curr_vse->curr_b;
1438  }
1439  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1440  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1441  // Re-adjust shape cost to include the updated width-to-height variance.
1442  if (full_wh_ratio_mean != 0.0f) {
1443  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1444  }
1445 
1446  word->set_rating(vse->ratings_sum);
1447  word->set_certainty(vse->min_certainty);
1448  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1449  vse->consistency_info.BodyMaxXHeight());
1450  if (vse->dawg_info != NULL) {
1451  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1452  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1453  word->set_permuter(NGRAM_PERM);
1454  } else if (vse->top_choice_flags) {
1456  } else {
1457  word->set_permuter(NO_PERM);
1458  }
1459  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1460  word_res->ratings));
1461  return word;
1462 }
#define tprintf(...)
Definition: tprintf.h:31
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:339
MATRIX * ratings
Definition: pageres.h:215
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
#define NULL
Definition: host.h:144
const UNICHARSET * uch_set
Definition: pageres.h:192
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
void set_rating(float new_val)
Definition: ratngs.h:366
int dimension() const
Definition: matrix.h:247
void set_length(int len)
Definition: ratngs.h:378
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:363
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_certainty(float new_val)
Definition: ratngs.h:369
int correct_segmentation_length() const
Definition: blamer.h:126
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry vse,
float  features[] 
)
static

Definition at line 1325 of file language_model.cpp.

1326  {
1327  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1328  // Record dictionary match info.
1329  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1330  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1331  if (vse.dawg_info != NULL) {
1332  int permuter = vse.dawg_info->permuter;
1333  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1334  if (vse.consistency_info.num_digits == vse.length) {
1335  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1336  } else {
1337  features[PTRAIN_NUM_SHORT+len] = 1.0;
1338  }
1339  } else if (permuter == DOC_DAWG_PERM) {
1340  features[PTRAIN_DOC_SHORT+len] = 1.0;
1341  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1342  permuter == COMPOUND_PERM) {
1343  features[PTRAIN_DICT_SHORT+len] = 1.0;
1344  } else if (permuter == FREQ_DAWG_PERM) {
1345  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1346  }
1347  }
1348  // Record shape cost feature (normalized by path length).
1349  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1350  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1351  // Record ngram cost. (normalized by the path length).
1352  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1353  if (vse.ngram_info != NULL) {
1354  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1355  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1356  }
1357  // Record consistency-related features.
1358  // Disabled this feature for now due to its poor performance.
1359  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1360  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1361  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1362  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
1363  vse.consistency_info.NumInconsistentChartype() : 0.0;
1364  features[PTRAIN_NUM_BAD_SPACING] =
1365  vse.consistency_info.NumInconsistentSpaces();
1366  // Disabled this feature for now due to its poor performance.
1367  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1368 
1369  // Classifier-related features.
1370  features[PTRAIN_RATING_PER_CHAR] =
1371  vse.ratings_sum / static_cast<float>(vse.outline_length);
1372 }
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
WERD_RES word_res,
LMConsistencyInfo consistency_info 
)
protected

Definition at line 1001 of file language_model.cpp.

1007  {
1008  const UNICHARSET &unicharset = dict_->getUnicharset();
1009  UNICHAR_ID unichar_id = b->unichar_id();
1010  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
1011 
1012  // Check punctuation validity.
1013  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1014  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
1015  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
1016  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1017  unicharset.get_isdigit(parent_b->unichar_id()))) {
1018  // reset punc_ref for compound words
1019  consistency_info->punc_ref = NO_EDGE;
1020  } else {
1021  bool is_apos = dict_->is_apostrophe(unichar_id);
1022  bool prev_is_numalpha = (parent_b != NULL &&
1023  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1024  unicharset.get_isdigit(parent_b->unichar_id())));
1025  UNICHAR_ID pattern_unichar_id =
1026  (unicharset.get_isalpha(unichar_id) ||
1027  unicharset.get_isdigit(unichar_id) ||
1028  (is_apos && prev_is_numalpha)) ?
1029  Dawg::kPatternUnicharID : unichar_id;
1030  if (consistency_info->punc_ref == NO_EDGE ||
1031  pattern_unichar_id != Dawg::kPatternUnicharID ||
1032  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1035  consistency_info->punc_ref);
1036  consistency_info->punc_ref =
1037  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1038  node, pattern_unichar_id, word_end) : NO_EDGE;
1039  if (consistency_info->punc_ref == NO_EDGE) {
1040  consistency_info->invalid_punc = true;
1041  }
1042  }
1043  }
1044  }
1045 
1046  // Update case related counters.
1047  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1048  // Reset counters if we are dealing with a compound word.
1049  consistency_info->num_lower = 0;
1050  consistency_info->num_non_first_upper = 0;
1051  }
1052  else if (unicharset.get_islower(unichar_id)) {
1053  consistency_info->num_lower++;
1054  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
1055  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1056  consistency_info->num_lower > 0 ||
1057  consistency_info->num_non_first_upper > 0) {
1058  consistency_info->num_non_first_upper++;
1059  }
1060  }
1061 
1062  // Initialize consistency_info->script_id (use script of unichar_id
1063  // if it is not Common, use script id recorded by the parent otherwise).
1064  // Set inconsistent_script to true if the script of the current unichar
1065  // is not consistent with that of the parent.
1066  consistency_info->script_id = unicharset.get_script(unichar_id);
1067  // Hiragana and Katakana can mix with Han.
1069  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1070  consistency_info->script_id == unicharset.hiragana_sid()) ||
1071  (unicharset.katakana_sid() != unicharset.null_sid() &&
1072  consistency_info->script_id == unicharset.katakana_sid())) {
1073  consistency_info->script_id = dict_->getUnicharset().han_sid();
1074  }
1075  }
1076 
1077  if (parent_vse != NULL &&
1078  (parent_vse->consistency_info.script_id !=
1079  dict_->getUnicharset().common_sid())) {
1080  int parent_script_id = parent_vse->consistency_info.script_id;
1081  // If script_id is Common, use script id of the parent instead.
1082  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1083  consistency_info->script_id = parent_script_id;
1084  }
1085  if (consistency_info->script_id != parent_script_id) {
1086  consistency_info->inconsistent_script = true;
1087  }
1088  }
1089 
1090  // Update chartype related counters.
1091  if (unicharset.get_isalpha(unichar_id)) {
1092  consistency_info->num_alphas++;
1093  } else if (unicharset.get_isdigit(unichar_id)) {
1094  consistency_info->num_digits++;
1095  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1096  consistency_info->num_other++;
1097  }
1098 
1099  // Check font and spacing consistency.
1100  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
1101  int fontinfo_id = -1;
1102  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1103  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1104  fontinfo_id = b->fontinfo_id();
1105  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1106  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1107  fontinfo_id = b->fontinfo_id2();
1108  }
1109  if(language_model_debug_level > 1) {
1110  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1111  (parent_b->fontinfo_id() >= 0) ?
1112  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1113  (parent_b->fontinfo_id2() >= 0) ?
1114  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1115  (b->fontinfo_id() >= 0) ?
1116  fontinfo_table_->get(b->fontinfo_id()).name : "",
1117  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1118  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1119  fontinfo_id);
1120  }
1121  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1122  bool expected_gap_found = false;
1123  float expected_gap;
1124  int temp_gap;
1125  if (fontinfo_id >= 0) { // found a common font
1126  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1127  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1128  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1129  expected_gap = temp_gap;
1130  expected_gap_found = true;
1131  }
1132  } else {
1133  consistency_info->inconsistent_font = true;
1134  // Get an average of the expected gaps in each font
1135  int num_addends = 0;
1136  expected_gap = 0;
1137  int temp_fid;
1138  for (int i = 0; i < 4; ++i) {
1139  if (i == 0) {
1140  temp_fid = parent_b->fontinfo_id();
1141  } else if (i == 1) {
1142  temp_fid = parent_b->fontinfo_id2();
1143  } else if (i == 2) {
1144  temp_fid = b->fontinfo_id();
1145  } else {
1146  temp_fid = b->fontinfo_id2();
1147  }
1148  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1149  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1150  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1151  expected_gap += temp_gap;
1152  num_addends++;
1153  }
1154  }
1155  expected_gap_found = (num_addends > 0);
1156  if (num_addends > 0) {
1157  expected_gap /= static_cast<float>(num_addends);
1158  }
1159  }
1160  if (expected_gap_found) {
1161  float actual_gap =
1162  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1163  float gap_ratio = expected_gap / actual_gap;
1164  // TODO(rays) The gaps seem to be way off most of the time, saved by
1165  // the error here that the ratio was compared to 1/2, when it should
1166  // have been 0.5f. Find the source of the gaps discrepancy and put
1167  // the 0.5f here in place of 0.0f.
1168  // Test on 2476595.sj, pages 0 to 6. (In French.)
1169  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1170  consistency_info->num_inconsistent_spaces++;
1171  }
1172  if (language_model_debug_level > 1) {
1173  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1174  unicharset.id_to_unichar(parent_b->unichar_id()),
1175  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1176  unichar_id, curr_col, expected_gap, actual_gap);
1177  }
1178  }
1179  }
1180  }
1181 }
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
name_table name
inT16 fontinfo_id() const
Definition: ratngs.h:85
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:408
GenericVector< int > blob_widths
Definition: pageres.h:205
#define tprintf(...)
Definition: tprintf.h:31
int han_sid() const
Definition: unicharset.h:836
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
int null_sid() const
Definition: unicharset.h:831
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
#define NULL
Definition: host.h:144
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
int UNICHAR_ID
Definition: unichar.h:33
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
const UnicityTable< FontInfo > * fontinfo_table_
bool empty() const
Definition: genericvector.h:84
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
int common_sid() const
Definition: unicharset.h:832
inT64 NODE_REF
Definition: dawg.h:55
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int hiragana_sid() const
Definition: unicharset.h:837
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
inT16 fontinfo_id2() const
Definition: ratngs.h:88
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
int katakana_sid() const
Definition: unicharset.h:838
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 772 of file language_model.cpp.

776  {
777  // Initialize active_dawgs from parent_vse if it is not NULL.
778  // Otherwise use very_beginning_active_dawgs_.
779  if (parent_vse == NULL) {
782  } else {
783  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
784  dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
785  dawg_args_->permuter = parent_vse->dawg_info->permuter;
786  }
787 
788  // Deal with hyphenated words.
789  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
790  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
791  return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
792  COMPOUND_PERM);
793  }
794 
795  // Deal with compound words.
796  if (dict_->compound_marker(b.unichar_id()) &&
797  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
798  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
799  // Do not allow compound operators at the beginning and end of the word.
800  // Do not allow more than one compound operator per word.
801  // Do not allow compounding of words with lengths shorter than
802  // language_model_min_compound_length
803  if (parent_vse == NULL || word_end ||
805  parent_vse->length < language_model_min_compound_length) return NULL;
806 
807  int i;
808  // Check that the path terminated before the current character is a word.
809  bool has_word_ending = false;
810  for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
811  const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
812  const Dawg *pdawg = pos.dawg_index < 0
813  ? NULL : dict_->GetDawg(pos.dawg_index);
814  if (pdawg == NULL || pos.back_to_punc) continue;;
815  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
816  pdawg->end_of_word(pos.dawg_ref)) {
817  has_word_ending = true;
818  break;
819  }
820  }
821  if (!has_word_ending) return NULL;
822 
823  if (language_model_debug_level > 0) tprintf("Compound word found\n");
824  return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
825  } // done dealing with compound words
826 
827  LanguageModelDawgInfo *dawg_info = NULL;
828 
829  // Call LetterIsOkay().
830  // Use the normalized IDs so that all shapes of ' can be allowed in words
831  // like don't.
832  const GenericVector<UNICHAR_ID>& normed_ids =
834  DawgPositionVector tmp_active_dawgs;
835  for (int i = 0; i < normed_ids.size(); ++i) {
837  tprintf("Test Letter OK for unichar %d, normed %d\n",
838  b.unichar_id(), normed_ids[i]);
839  dict_->LetterIsOkay(dawg_args_, normed_ids[i],
840  word_end && i == normed_ids.size() - 1);
841  if (dawg_args_->permuter == NO_PERM) {
842  break;
843  } else if (i < normed_ids.size() - 1) {
844  tmp_active_dawgs = *dawg_args_->updated_dawgs;
845  dawg_args_->active_dawgs = &tmp_active_dawgs;
846  }
848  tprintf("Letter was OK for unichar %d, normed %d\n",
849  b.unichar_id(), normed_ids[i]);
850  }
852  if (dawg_args_->permuter != NO_PERM) {
853  dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
855  } else if (language_model_debug_level > 3) {
856  tprintf("Letter %s not OK!\n",
858  }
859 
860  return dawg_info;
861 }
DawgPositionVector * beginning_active_dawgs_
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
DawgPositionVector * active_dawgs
Definition: dict.h:81
#define NULL
Definition: host.h:144
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
DawgPositionVector * updated_dawgs
Definition: dict.h:82
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
DawgPositionVector * very_beginning_active_dawgs_
PermuterType permuter
Definition: dict.h:83
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 863 of file language_model.cpp.

866  {
867  // Initialize parent context.
868  const char *pcontext_ptr = "";
869  int pcontext_unichar_step_len = 0;
870  if (parent_vse == NULL) {
871  pcontext_ptr = prev_word_str_.string();
872  pcontext_unichar_step_len = prev_word_unichar_step_len_;
873  } else {
874  pcontext_ptr = parent_vse->ngram_info->context.string();
875  pcontext_unichar_step_len =
876  parent_vse->ngram_info->context_unichar_step_len;
877  }
878  // Compute p(unichar | parent context).
879  int unichar_step_len = 0;
880  bool pruned = false;
881  float ngram_cost;
882  float ngram_and_classifier_cost =
883  ComputeNgramCost(unichar, certainty, denom,
884  pcontext_ptr, &unichar_step_len,
885  &pruned, &ngram_cost);
886  // Normalize just the ngram_and_classifier_cost by outline_length.
887  // The ngram_cost is used by the params_model, so it needs to be left as-is,
888  // and the params model cost will be normalized by outline_length.
889  ngram_and_classifier_cost *=
890  outline_length / language_model_ngram_rating_factor;
891  // Add the ngram_cost of the parent.
892  if (parent_vse != NULL) {
893  ngram_and_classifier_cost +=
894  parent_vse->ngram_info->ngram_and_classifier_cost;
895  ngram_cost += parent_vse->ngram_info->ngram_cost;
896  }
897 
898  // Shorten parent context string by unichar_step_len unichars.
899  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
901  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
902  while (num_remove > 0 && *pcontext_ptr != '\0') {
903  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
904  --num_remove;
905  }
906 
907  // Decide whether to prune this ngram path and update changed accordingly.
908  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
909 
910  // Construct and return the new LanguageModelNgramInfo.
911  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
912  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
913  ngram_and_classifier_cost);
914  ngram_info->context += unichar;
915  ngram_info->context_unichar_step_len += unichar_step_len;
916  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
917  return ngram_info;
918 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry new_vse,
const ViterbiStateEntry parent_vse,
LanguageModelState lms 
)
protected

Definition at line 756 of file language_model.cpp.

758  {
759  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
760  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
761  new_vse->cost >= vit.data()->cost; vit.forward()) {
762  // Clear the appropriate flags if the list already contains
763  // a top choice entry with a lower cost.
764  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
765  }
766  if (language_model_debug_level > 2) {
767  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
768  new_vse->top_choice_flags);
769  }
770 }
#define tprintf(...)
Definition: tprintf.h:31
ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET unicharset,
WERD_RES word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType top_choice_flags 
) const
protected

Definition at line 487 of file language_model.cpp.

491  {
492  for (; !vse_it->cycled_list(); vse_it->forward()) {
493  ViterbiStateEntry* parent_vse = vse_it->data();
494  // Only consider the parent if it has been updated or
495  // if the current ratings cell has just been classified.
496  if (!just_classified && !parent_vse->updated) continue;
498  parent_vse->Print("Considering");
499  // If the parent is non-alnum, then upper counts as lower.
500  *top_choice_flags = blob_choice_flags;
501  if ((blob_choice_flags & kUpperCaseFlag) &&
502  !parent_vse->HasAlnumChoice(unicharset)) {
503  *top_choice_flags |= kLowerCaseFlag;
504  }
505  *top_choice_flags &= parent_vse->top_choice_flags;
506  UNICHAR_ID unichar_id = bc->unichar_id();
507  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
508  UNICHAR_ID parent_id = parent_b->unichar_id();
509  // Digits do not bind to alphas if there is a mix in both parent and current
510  // or if the alpha is not the top choice.
511  if (unicharset.get_isdigit(unichar_id) &&
512  unicharset.get_isalpha(parent_id) &&
513  (mixed_alnum || *top_choice_flags == 0))
514  continue; // Digits don't bind to alphas.
515  // Likewise alphas do not bind to digits if there is a mix in both or if
516  // the digit is not the top choice.
517  if (unicharset.get_isalpha(unichar_id) &&
518  unicharset.get_isdigit(parent_id) &&
519  (mixed_alnum || *top_choice_flags == 0))
520  continue; // Alphas don't bind to digits.
521  // If there is a case mix of the same alpha in the parent list, then
522  // competing_vse is non-null and will be used to determine whether
523  // or not to bind the current blob choice.
524  if (parent_vse->competing_vse != NULL) {
525  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
526  UNICHAR_ID other_id = competing_b->unichar_id();
527  if (language_model_debug_level >= 5) {
528  tprintf("Parent %s has competition %s\n",
529  unicharset.id_to_unichar(parent_id),
530  unicharset.id_to_unichar(other_id));
531  }
532  if (unicharset.SizesDistinct(parent_id, other_id)) {
533  // If other_id matches bc wrt position and size, and parent_id, doesn't,
534  // don't bind to the current parent.
535  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
537  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
539  continue; // Competing blobchoice has a better vertical match.
540  }
541  }
542  vse_it->forward();
543  return parent_vse; // This one is good!
544  }
545  return NULL; // Ran out of possibilities.
546 }
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
#define tprintf(...)
Definition: tprintf.h:31
#define NULL
Definition: host.h:144
float x_height
Definition: pageres.h:295
int UNICHAR_ID
Definition: unichar.h:33
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132
ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 100 of file language_model.h.

100 { return params_model_; }
bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Definition at line 374 of file language_model.cpp.

377  {
378  BLOB_CHOICE_IT c_it(curr_list);
379  const UNICHARSET &unicharset = dict_->getUnicharset();
380  BLOB_CHOICE *first_unichar = NULL;
381  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
382  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
383  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
384  if (first_unichar == NULL) first_unichar = c_it.data();
385  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
386  *first_lower = c_it.data();
387  }
388  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
389  !unicharset.get_islower(unichar_id)) {
390  *first_upper = c_it.data();
391  }
392  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
393  *first_digit = c_it.data();
394  }
395  }
396  ASSERT_HOST(first_unichar != NULL);
397  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
398  *first_digit != NULL;
399  if (*first_lower == NULL) *first_lower = first_unichar;
400  if (*first_upper == NULL) *first_upper = first_unichar;
401  if (*first_digit == NULL) *first_digit = first_unichar;
402  return mixed;
403 }
#define NULL
Definition: host.h:144
int UNICHAR_ID
Definition: unichar.h:33
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
Definition: cluster.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 138 of file language_model.cpp.

140  {
141  fixed_pitch_ = fixed_pitch;
142  max_char_wh_ratio_ = max_char_wh_ratio;
143  rating_cert_scale_ = rating_cert_scale;
144  acceptable_choice_found_ = false;
146 
147  // Initialize vectors with beginning DawgInfos.
152 
153  // Fill prev_word_str_ with the last language_model_ngram_order
154  // unichars from prev_word.
156  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
157  prev_word_str_ = prev_word->unichar_string();
159  } else {
160  prev_word_str_ = " ";
161  }
162  const char *str_ptr = prev_word_str_.string();
163  const char *str_end = str_ptr + prev_word_str_.length();
164  int step;
166  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
167  str_ptr += step;
169  }
170  ASSERT_HOST(str_ptr == str_end);
171  }
172 }
DawgPositionVector * beginning_active_dawgs_
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
inT32 length() const
Definition: strngs.cpp:188
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
const STRING & unichar_string() const
Definition: ratngs.h:524
DawgPositionVector * very_beginning_active_dawgs_
bool language_model_ngram_space_delimited_language
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry vse)
inlineprotected

Definition at line 291 of file language_model.h.

291  {
292  if (vse.top_choice_flags) return false;
293  if (vse.dawg_info != NULL &&
294  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
295  vse.dawg_info->permuter == USER_DAWG_PERM ||
296  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
297  return true;
298  }
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 96 of file language_model.h.

96  {
98  }
int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState parent_node) const
protected

Definition at line 412 of file language_model.cpp.

413  {
414  if (parent_node == NULL) return -1;
415  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
416  ViterbiStateEntry* top_lower = NULL;
417  ViterbiStateEntry* top_upper = NULL;
418  ViterbiStateEntry* top_digit = NULL;
419  ViterbiStateEntry* top_choice = NULL;
420  float lower_rating = 0.0f;
421  float upper_rating = 0.0f;
422  float digit_rating = 0.0f;
423  float top_rating = 0.0f;
424  const UNICHARSET &unicharset = dict_->getUnicharset();
425  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
426  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
427  ViterbiStateEntry* vse = vit.data();
428  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
429  // back to the real character if needed.
430  ViterbiStateEntry* unichar_vse = vse;
431  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
432  float rating = unichar_vse->curr_b->rating();
433  while (unichar_id == INVALID_UNICHAR_ID &&
434  unichar_vse->parent_vse != NULL) {
435  unichar_vse = unichar_vse->parent_vse;
436  unichar_id = unichar_vse->curr_b->unichar_id();
437  rating = unichar_vse->curr_b->rating();
438  }
439  if (unichar_id != INVALID_UNICHAR_ID) {
440  if (unicharset.get_islower(unichar_id)) {
441  if (top_lower == NULL || lower_rating > rating) {
442  top_lower = vse;
443  lower_rating = rating;
444  }
445  } else if (unicharset.get_isalpha(unichar_id)) {
446  if (top_upper == NULL || upper_rating > rating) {
447  top_upper = vse;
448  upper_rating = rating;
449  }
450  } else if (unicharset.get_isdigit(unichar_id)) {
451  if (top_digit == NULL || digit_rating > rating) {
452  top_digit = vse;
453  digit_rating = rating;
454  }
455  }
456  }
457  if (top_choice == NULL || top_rating > rating) {
458  top_choice = vse;
459  top_rating = rating;
460  top_id = unichar_id;
461  }
462  }
463  if (top_choice == NULL) return -1;
464  bool mixed = (top_lower != NULL || top_upper != NULL) &&
465  top_digit != NULL;
466  if (top_lower == NULL) top_lower = top_choice;
467  top_lower->top_choice_flags |= kLowerCaseFlag;
468  if (top_upper == NULL) top_upper = top_choice;
469  top_upper->top_choice_flags |= kUpperCaseFlag;
470  if (top_digit == NULL) top_digit = top_choice;
471  top_digit->top_choice_flags |= kDigitFlag;
472  top_choice->top_choice_flags |= kSmallestRatingFlag;
473  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
474  (top_choice->top_choice_flags &
476  // If the compound marker top choice carries any of the top alnum flags,
477  // then give it all of them, allowing words like I-295 to be chosen.
478  top_choice->top_choice_flags |=
480  }
481  return mixed ? 1 : 0;
482 }
#define NULL
Definition: host.h:144
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
int UNICHAR_ID
Definition: unichar.h:33
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
static const LanguageModelFlagsType kDigitFlag
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kLowerCaseFlag
Definition: cluster.h:45
void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry vse,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1225 of file language_model.cpp.

1230  {
1231  bool truth_path;
1232  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1233  blamer_bundle, &truth_path);
1234  ASSERT_HOST(word != NULL);
1235  if (dict_->stopper_debug_level >= 1) {
1236  STRING word_str;
1237  word->string_and_lengths(&word_str, NULL);
1238  vse->Print(word_str.string());
1239  }
1240  if (language_model_debug_level > 0) {
1241  word->print("UpdateBestChoice() constructed word");
1242  }
1243  // Record features from the current path if necessary.
1244  ParamsTrainingHypothesis curr_hyp;
1245  if (blamer_bundle != NULL) {
1246  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
1247  static_cast<PermuterType>(word->permuter());
1248  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1249  word->string_and_lengths(&(curr_hyp.str), NULL);
1250  curr_hyp.cost = vse->cost; // record cost for error rate computations
1251  if (language_model_debug_level > 0) {
1252  tprintf("Raw features extracted from %s (cost=%g) [ ",
1253  curr_hyp.str.string(), curr_hyp.cost);
1254  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1255  tprintf("%g ", curr_hyp.features[deb_i]);
1256  }
1257  tprintf("]\n");
1258  }
1259  // Record the current hypothesis in params_training_bundle.
1260  blamer_bundle->AddHypothesis(curr_hyp);
1261  if (truth_path)
1262  blamer_bundle->UpdateBestRating(word->rating());
1263  }
1264  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
1265  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1266  // we no longer need it.
1267  delete word;
1268  return;
1269  }
1270  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
1271  word->SetScriptPositions(false, word_res->chopped_word);
1272  // Update and log new raw_choice if needed.
1273  if (word_res->raw_choice == NULL ||
1274  word->rating() < word_res->raw_choice->rating()) {
1275  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1276  tprintf("Updated raw choice\n");
1277  }
1278  // Set the modified rating for best choice to vse->cost and log best choice.
1279  word->set_rating(vse->cost);
1280  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1281  // computes adjust_factor that is used by the adaption code (e.g. by
1282  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1283  // Note: the rating of the word is not adjusted.
1284  dict_->adjust_word(word, vse->dawg_info == NULL,
1285  vse->consistency_info.xht_decision, 0.0,
1286  false, language_model_debug_level > 0);
1287  // Hand ownership of the word over to the word_res.
1289  dict_->stopper_debug_level >= 1, word)) {
1290  // The word was so bad that it was deleted.
1291  return;
1292  }
1293  if (word_res->best_choice == word) {
1294  // Word was the new best.
1295  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1296  AcceptablePath(*vse)) {
1297  acceptable_choice_found_ = true;
1298  }
1299  // Update best_choice_bundle.
1300  best_choice_bundle->updated = true;
1301  best_choice_bundle->best_vse = vse;
1302  if (language_model_debug_level > 0) {
1303  tprintf("Updated best choice\n");
1304  word->print_state("New state ");
1305  }
1306  // Update hyphen state if we are dealing with a dictionary word.
1307  if (vse->dawg_info != NULL) {
1308  if (dict_->has_hyphen_end(*word)) {
1310  } else {
1311  dict_->reset_hyphen_vars(true);
1312  }
1313  }
1314 
1315  if (blamer_bundle != NULL) {
1317  vse->dawg_info != NULL && vse->top_choice_flags);
1318  }
1319  }
1320  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
1321  word->DisplaySegmentation(word_res->chopped_word);
1322  }
1323 }
bool AcceptablePath(const ViterbiStateEntry &vse)
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
TWERD * chopped_word
Definition: pageres.h:201
void print_state(const char *msg) const
Definition: ratngs.cpp:738
#define tprintf(...)
Definition: tprintf.h:31
DawgPositionVector * active_dawgs
Definition: dict.h:81
#define NULL
Definition: host.h:144
int stopper_debug_level
Definition: dict.h:612
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
void set_rating(float new_val)
Definition: ratngs.h:366
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
PermuterType
Definition: ratngs.h:240
float rating() const
Definition: ratngs.h:324
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528
const char * string() const
Definition: strngs.cpp:193
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
uinT8 permuter() const
Definition: ratngs.h:343
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
Definition: strngs.h:44
bool empty() const
Definition: genericvector.h:84
void print() const
Definition: ratngs.h:563
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
WERD_CHOICE * best_choice
Definition: pageres.h:219
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:747
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
#define ASSERT_HOST(x)
Definition: errcode.h:84
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
void UpdateBestRating(float rating)
Definition: blamer.h:122
bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState parent_node,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

Definition at line 246 of file language_model.cpp.

254  {
255  if (language_model_debug_level > 0) {
256  tprintf("\nUpdateState: col=%d row=%d %s",
257  curr_col, curr_row, just_classified ? "just_classified" : "");
259  tprintf("(parent=%p)\n", parent_node);
260  else
261  tprintf("\n");
262  }
263  // Initialize helper variables.
264  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
265  bool new_changed = false;
266  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
267  const UNICHARSET& unicharset = dict_->getUnicharset();
268  BLOB_CHOICE *first_lower = NULL;
269  BLOB_CHOICE *first_upper = NULL;
270  BLOB_CHOICE *first_digit = NULL;
271  bool has_alnum_mix = false;
272  if (parent_node != NULL) {
273  int result = SetTopParentLowerUpperDigit(parent_node);
274  if (result < 0) {
276  tprintf("No parents found to process\n");
277  return false;
278  }
279  if (result > 0)
280  has_alnum_mix = true;
281  }
282  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
283  &first_digit))
284  has_alnum_mix = false;;
285  ScanParentsForCaseMix(unicharset, parent_node);
286  if (language_model_debug_level > 3 && parent_node != NULL) {
287  parent_node->Print("Parent viterbi list");
288  }
289  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
290 
291  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
292  ViterbiStateEntry_IT vit;
293  BLOB_CHOICE_IT c_it(curr_list);
294  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
295  BLOB_CHOICE* choice = c_it.data();
296  // TODO(antonova): make sure commenting this out is ok for ngram
297  // model scoring (I think this was introduced to fix ngram model quirks).
298  // Skip NULL unichars unless it is the only choice.
299  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
300  UNICHAR_ID unichar_id = choice->unichar_id();
301  if (unicharset.get_fragment(unichar_id)) {
302  continue; // Skip fragments.
303  }
304  // Set top choice flags.
305  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
306  if (c_it.at_first() || !new_changed)
307  blob_choice_flags |= kSmallestRatingFlag;
308  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
309  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
310  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
311 
312  if (parent_node == NULL) {
313  // Process the beginning of a word.
314  // If there is a better case variant that is not distinguished by size,
315  // skip this blob choice, as we have no choice but to accept the result
316  // of the character classifier to distinguish between them, even if
317  // followed by an upper case.
318  // With words like iPoc, and other CamelBackWords, the lower-upper
319  // transition can only be achieved if the classifier has the correct case
320  // as the top choice, and leaving an initial I lower down the list
321  // increases the chances of choosing IPoc simply because it doesn't
322  // include such a transition. iPoc will beat iPOC and ipoc because
323  // the other words are baseline/x-height inconsistent.
324  if (HasBetterCaseVariant(unicharset, choice, curr_list))
325  continue;
326  // Upper counts as lower at the beginning of a word.
327  if (blob_choice_flags & kUpperCaseFlag)
328  blob_choice_flags |= kLowerCaseFlag;
329  new_changed |= AddViterbiStateEntry(
330  blob_choice_flags, denom, word_end, curr_col, curr_row,
331  choice, curr_state, NULL, pain_points,
332  word_res, best_choice_bundle, blamer_bundle);
333  } else {
334  // Get viterbi entries from each parent ViterbiStateEntry.
335  vit.set_to_list(&parent_node->viterbi_state_entries);
336  int vit_counter = 0;
337  vit.mark_cycle_pt();
338  ViterbiStateEntry* parent_vse = NULL;
339  LanguageModelFlagsType top_choice_flags;
340  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
341  c_it.data(), blob_choice_flags,
342  unicharset, word_res, &vit,
343  &top_choice_flags)) != NULL) {
344  // Skip pruned entries and do not look at prunable entries if already
345  // examined language_model_viterbi_list_max_num_prunable of those.
346  if (PrunablePath(*parent_vse) &&
348  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
349  continue;
350  }
351  // If the parent has no alnum choice, (ie choice is the first in a
352  // string of alnum), and there is a better case variant that is not
353  // distinguished by size, skip this blob choice/parent, as with the
354  // initial blob treatment above.
355  if (!parent_vse->HasAlnumChoice(unicharset) &&
356  HasBetterCaseVariant(unicharset, choice, curr_list))
357  continue;
358  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
359  // looks good according to the Dawgs or character ngram model.
360  new_changed |= AddViterbiStateEntry(
361  top_choice_flags, denom, word_end, curr_col, curr_row,
362  c_it.data(), curr_state, parent_vse, pain_points,
363  word_res, best_choice_bundle, blamer_bundle);
364  }
365  }
366  }
367  return new_changed;
368 }
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
#define tprintf(...)
Definition: tprintf.h:31
MATRIX * ratings
Definition: pageres.h:215
static const LanguageModelFlagsType kXhtConsistentFlag
#define NULL
Definition: host.h:144
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool PrunablePath(const ViterbiStateEntry &vse)
int dimension() const
Definition: matrix.h:247
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
int language_model_viterbi_list_max_num_prunable
int UNICHAR_ID
Definition: unichar.h:33
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
static const LanguageModelFlagsType kDigitFlag
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
unsigned char LanguageModelFlagsType
Definition: lm_state.h:37
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kLowerCaseFlag
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

Member Data Documentation

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 408 of file language_model.h.

DawgPositionVector* tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 396 of file language_model.h.

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 410 of file language_model.h.

DawgArgs* tesseract::LanguageModel::dawg_args_
protected

Definition at line 356 of file language_model.h.

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 375 of file language_model.h.

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 382 of file language_model.h.

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 371 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 48 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 46 of file language_model.h.

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 53 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 45 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 47 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 49 of file language_model.h.

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 308 of file language_model.h.

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 335 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 322 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 310 of file language_model.h.

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 312 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "

Definition at line 331 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 328 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 320 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 333 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 325 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 344 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 348 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 350 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 353 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 340 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 338 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 342 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 346 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 352 of file language_model.h.

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 356 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 317 of file language_model.h.

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 385 of file language_model.h.

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 413 of file language_model.h.

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 392 of file language_model.h.

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 393 of file language_model.h.

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 366 of file language_model.h.

DawgPositionVector* tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 395 of file language_model.h.

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 354 of file language_model.h.


The documentation for this class was generated from the following files: