Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cube_utils.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: cube_utils.cpp
3  * Description: Implementation of the Cube Utilities Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include "cube_utils.h"
24 #include "char_set.h"
25 #include "unichar.h"
26 
27 namespace tesseract {
29 }
30 
32 }
33 
34 // convert a prob to a cost (-ve log prob)
35 int CubeUtils::Prob2Cost(double prob_val) {
36  if (prob_val < MIN_PROB) {
37  return MIN_PROB_COST;
38  }
39  return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
40 }
41 
42 // converts a cost to probability
43 double CubeUtils::Cost2Prob(int cost) {
44  return exp(-cost / PROB2COST_SCALE);
45 }
46 
47 // computes the length of a NULL terminated char_32 string
48 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
49  if (char_32_ptr == NULL) {
50  return 0;
51  }
52  int len = -1;
53  while (char_32_ptr[++len]);
54  return len;
55 }
56 
57 // compares two char_32 strings
58 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
59  const char_32 *pch1 = str1;
60  const char_32 *pch2 = str2;
61 
62  for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
63  if ((*pch1) != (*pch2)) {
64  return (*pch1) - (*pch2);
65  }
66  }
67 
68  if ((*pch1) == 0) {
69  if ((*pch2) == 0) {
70  return 0;
71  } else {
72  return -1;
73  }
74  } else {
75  return 1;
76  }
77 }
78 
79 // Duplicates a 32-bit char buffer
81  int len = StrLen(str32);
82  char_32 *new_str = new char_32[len + 1];
83  if (new_str == NULL) {
84  return NULL;
85  }
86  memcpy(new_str, str32, len * sizeof(*str32));
87  new_str[len] = 0;
88  return new_str;
89 }
90 
91 // creates a raw buffer from the specified location of the image
92 unsigned char *CubeUtils::GetImageData(IMAGE *img, int left,
93  int top, int wid, int hgt) {
94  // skip invalid dimensions
95  if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
96  (left + wid) > img->get_xsize() ||
97  (top + hgt) > img->get_ysize()) {
98  return NULL;
99  }
100 
101  // copy the char img to a temp buffer
102  unsigned char *temp_buff = new unsigned char[wid * hgt];
103  if (temp_buff == NULL) {
104  return NULL;
105  }
106 
107  IMAGELINE line;
108  line.init(wid);
109 
110  for (int y = 0, off = 0; y < hgt ; y++) {
111  img->get_line(left, img->get_ysize() - 1 - y - top, wid, &line, 0);
112  for (int x = 0; x < wid; x++, off++) {
113  temp_buff[off] = line.pixels[x] ? 255 : 0;
114  }
115  }
116 
117  return temp_buff;
118 }
119 
120 // creates a char samp from a specified portion of the image
122  int left, int top,
123  int wid, int hgt) {
124  // get the raw img data from the image
125  unsigned char *temp_buff = GetImageData(img, left, top, wid, hgt);
126  if (temp_buff == NULL) {
127  return NULL;
128  }
129 
130  // create a char samp from temp buffer
131  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
132  // clean up temp buffer
133  delete []temp_buff;
134  return char_samp;
135 }
136 
137 // creates a char samp from a specified portion of the image
138 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
139  int wid, int hgt) {
140  // get the raw img data from the image
141  unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
142  if (temp_buff == NULL) {
143  return NULL;
144  }
145 
146  // create a char samp from temp buffer
147  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
148 
149  // clean up temp buffer
150  delete []temp_buff;
151  return char_samp;
152 }
153 
154 // create a B/W image from a char_sample
156  // parameter check
157  if (char_samp == NULL) {
158  return NULL;
159  }
160 
161  // get the raw data
162  int stride = char_samp->Stride(),
163  wid = char_samp->Width(),
164  hgt = char_samp->Height();
165 
166  unsigned char *buff = char_samp->RawData();
167  if (buff == NULL) {
168  return NULL;
169  }
170 
171  // create a new image object
172  IMAGE *img = new IMAGE();
173  if (img == NULL) {
174  return NULL;
175  }
176 
177  // create a blank B/W image
178  if (img->create(wid, hgt, 1) == -1) {
179  delete img;
180  return NULL;
181  }
182 
183  // copy the contents
184  IMAGELINE line;
185  line.init(wid);
186 
187  for (int y = 0, off = 0; y < hgt ; y++, off += stride) {
188  for (int x = 0; x < wid; x++) {
189  line.pixels[x] = (buff[off + x] == 0) ? 0 : 1;
190  }
191 
192  img->fast_put_line(0, hgt - 1 - y, wid, &line);
193  }
194 
195  return img;
196 }
197 
198 // create a B/W image from a char_sample
200  // parameter check
201  if (char_samp == NULL) {
202  return NULL;
203  }
204 
205  // get the raw data
206  int stride = char_samp->Stride();
207  int wid = char_samp->Width();
208  int hgt = char_samp->Height();
209 
210  Pix *pix = pixCreate(wid, hgt, 1);
211  if (pix == NULL) {
212  return NULL;
213  }
214 
215  // copy the contents
216  unsigned char *line = char_samp->RawData();
217  for (int y = 0; y < hgt ; y++, line += stride) {
218  for (int x = 0; x < wid; x++) {
219  if (line[x] != 0) {
220  pixSetPixel(pix, x, y, 0);
221  } else {
222  pixSetPixel(pix, x, y, 255);
223  }
224  }
225  }
226 
227  return pix;
228 }
229 
230 // creates a raw buffer from the specified location of the pix
231 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
232  int wid, int hgt) {
233  // skip invalid dimensions
234  if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
235  (left + wid) > pix->w || (top + hgt) > pix->h ||
236  pix->d != 1) {
237  return NULL;
238  }
239 
240  // copy the char img to a temp buffer
241  unsigned char *temp_buff = new unsigned char[wid * hgt];
242  if (temp_buff == NULL) {
243  return NULL;
244  }
245 
246  l_int32 w;
247  l_int32 h;
248  l_int32 d;
249  l_int32 wpl;
250  l_uint32 *line;
251  l_uint32 *data;
252 
253  pixGetDimensions(pix, &w, &h, &d);
254  wpl = pixGetWpl(pix);
255  data = pixGetData(pix);
256  line = data + (top * wpl);
257 
258  for (int y = 0, off = 0; y < hgt ; y++) {
259  for (int x = 0; x < wid; x++, off++) {
260  temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
261  }
262  line += wpl;
263  }
264  return temp_buff;
265 }
266 
267 // read file contents to a string
268 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
269  str->clear();
270  FILE *fp = fopen(file_name.c_str(), "rb");
271  if (fp == NULL) {
272  return false;
273  }
274 
275  // get the size of the size
276  fseek(fp, 0, SEEK_END);
277  int file_size = ftell(fp);
278  if (file_size < 1) {
279  fclose(fp);
280  return false;
281  }
282  // adjust string size
283  str->reserve(file_size);
284  // read the contents
285  rewind(fp);
286  char *buff = new char[file_size];
287  if (buff == NULL) {
288  fclose(fp);
289  return false;
290  }
291  int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
292  if (read_bytes == file_size) {
293  str->append(buff, file_size);
294  }
295  delete []buff;
296  fclose(fp);
297  return (read_bytes == file_size);
298 }
299 
300 // splits a string into vectors based on specified delimiters
301 void CubeUtils::SplitStringUsing(const string &str,
302  const string &delims,
303  vector<string> *str_vec) {
304  // Optimize the common case where delims is a single character.
305  if (delims[0] != '\0' && delims[1] == '\0') {
306  char c = delims[0];
307  const char* p = str.data();
308  const char* end = p + str.size();
309  while (p != end) {
310  if (*p == c) {
311  ++p;
312  } else {
313  const char* start = p;
314  while (++p != end && *p != c);
315  str_vec->push_back(string(start, p - start));
316  }
317  }
318  return;
319  }
320 
321  string::size_type begin_index, end_index;
322  begin_index = str.find_first_not_of(delims);
323  while (begin_index != string::npos) {
324  end_index = str.find_first_of(delims, begin_index);
325  if (end_index == string::npos) {
326  str_vec->push_back(str.substr(begin_index));
327  return;
328  }
329  str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
330  begin_index = str.find_first_not_of(delims, end_index);
331  }
332 }
333 
334 // UTF-8 to UTF-32 convesion functions
335 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
336  str32->clear();
337  int len = strlen(utf8_str);
338  int step = 0;
339  for (int ch = 0; ch < len; ch += step) {
340  step = UNICHAR::utf8_step(utf8_str + ch);
341  if (step > 0) {
342  UNICHAR uni_ch(utf8_str + ch, step);
343  (*str32) += uni_ch.first_uni();
344  }
345  }
346 }
347 
348 // UTF-8 to UTF-32 convesion functions
349 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
350  str->clear();
351  for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {
352  UNICHAR uni_ch((*ch_32));
353  char *utf8 = uni_ch.utf8_str();
354  if (utf8 != NULL) {
355  (*str) += utf8;
356  delete []utf8;
357  }
358  }
359 }
360 
361 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
362  bool all_one_case = true;
363  bool capitalized;
364  bool prev_upper;
365  bool prev_lower;
366  bool first_upper;
367  bool first_lower;
368  bool cur_upper;
369  bool cur_lower;
370 
371  string str8;
372  if (!char_set) {
373  // If cube char_set is missing, use C-locale-dependent functions
374  // on UTF8 characters to determine case properties.
375  first_upper = isupper(str32[0]);
376  first_lower = islower(str32[0]);
377  if (first_upper)
378  capitalized = true;
379  prev_upper = first_upper;
380  prev_lower = islower(str32[0]);
381  for (int c = 1; str32[c] != 0; ++c) {
382  cur_upper = isupper(str32[c]);
383  cur_lower = islower(str32[c]);
384  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
385  all_one_case = false;
386  if (cur_upper)
387  capitalized = false;
388  prev_upper = cur_upper;
389  prev_lower = cur_lower;
390  }
391  } else {
392  UNICHARSET *unicharset = char_set->InternalUnicharset();
393  // Use UNICHARSET functions to determine case properties
394  first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
395  first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
396  if (first_upper)
397  capitalized = true;
398  prev_upper = first_upper;
399  prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
400 
401  for (int c = 1; c < StrLen(str32); ++c) {
402  cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
403  cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
404  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
405  all_one_case = false;
406  if (cur_upper)
407  capitalized = false;
408  prev_upper = cur_upper;
409  prev_lower = cur_lower;
410  }
411  }
412  return all_one_case || capitalized;
413 }
414 
415 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
416  if (!char_set) {
417  return NULL;
418  }
419  UNICHARSET *unicharset = char_set->InternalUnicharset();
420  int len = StrLen(str32);
421  char_32 *lower = new char_32[len + 1];
422  if (!lower)
423  return NULL;
424  for (int i = 0; i < len; ++i) {
425  char_32 ch = str32[i];
426  if (ch == INVALID_UNICHAR_ID) {
427  delete [] lower;
428  return NULL;
429  }
430  // convert upper-case characters to lower-case
431  if (unicharset->get_isupper(char_set->ClassID(ch))) {
432  UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
433  const char_32 *str32_lower = char_set->ClassString(uid_lower);
434  // expect lower-case version of character to be a single character
435  if (!str32_lower || StrLen(str32_lower) != 1) {
436  delete [] lower;
437  return NULL;
438  }
439  lower[i] = str32_lower[0];
440  } else {
441  lower[i] = ch;
442  }
443  }
444  lower[len] = 0;
445  return lower;
446 }
447 
448 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
449  if (!char_set) {
450  return NULL;
451  }
452  UNICHARSET *unicharset = char_set->InternalUnicharset();
453  int len = StrLen(str32);
454  char_32 *upper = new char_32[len + 1];
455  if (!upper)
456  return NULL;
457  for (int i = 0; i < len; ++i) {
458  char_32 ch = str32[i];
459  if (ch == INVALID_UNICHAR_ID) {
460  delete [] upper;
461  return NULL;
462  }
463  // convert lower-case characters to upper-case
464  if (unicharset->get_islower(char_set->ClassID(ch))) {
465  UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
466  const char_32 *str32_upper = char_set->ClassString(uid_upper);
467  // expect upper-case version of character to be a single character
468  if (!str32_upper || StrLen(str32_upper) != 1) {
469  delete [] upper;
470  return NULL;
471  }
472  upper[i] = str32_upper[0];
473  } else {
474  upper[i] = ch;
475  }
476  }
477  upper[len] = 0;
478  return upper;
479 }
480 } // namespace tesseract