Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
shapetable.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.h
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Thu Oct 28 17:46:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
24 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
25 
26 #include "genericvector.h"
27 #include "intmatcher.h"
28 
29 class STRING;
30 class UNICHARSET;
31 
32 namespace tesseract {
33 
34 // Simple struct to hold a set of fonts associated with a single unichar-id.
35 // A vector of UnicharAndFonts makes a shape.
38  }
39  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
    // Start the font list with the one font supplied for this unichar.
40  font_ids.push_back(font_id);
41  }
42 
43  // Writes to the given file. Returns false in case of error.
44  bool Serialize(FILE* fp) const;
45  // Reads from the given file. Returns false in case of error.
46  // If swap is true, assumes a big/little-endian swap is needed.
47  bool DeSerialize(bool swap, FILE* fp);
48 
49  // Comparison callback that orders two UnicharAndFonts by unichar_id.
    // Takes untyped pointers; presumably follows the qsort convention of
    // returning negative/zero/positive -- TODO confirm against the definition.
50  static int SortByUnicharId(const void* v1, const void* v2);
51 
54 };
55 
56 // A Shape is a collection of unichar-ids and a list of fonts associated with
57 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
58 // a classifiable unit, and represents a group of characters or parts of
59 // characters that have a similar or identical shape. Shapes/ShapeTables may
60 // be organized hierarchically from identical shapes at the leaves to vaguely
61 // similar shapes near the root.
62 class Shape {
63  public:
    // Default constructor. Sets destination_index_ to -1 and clears the
    // sorted flag so that neither member is ever read while indeterminate.
    // false is the conservative value for the flag: it can only cause an
    // extra sort on demand, never a skipped one. Initializers are listed in
    // member declaration order.
    // NOTE(review): -1 presumably means "not merged" -- confirm against the
    // merge code.
64  Shape() : unichars_sorted_(false), destination_index_(-1) {}
65 
66  // Writes to the given file. Returns false in case of error.
67  bool Serialize(FILE* fp) const;
68  // Reads from the given file. Returns false in case of error.
69  // If swap is true, assumes a big/little-endian swap is needed.
70  bool DeSerialize(bool swap, FILE* fp);
71 
    // Accessors for destination_index_.
72  int destination_index() const {
73  return destination_index_;
74  }
75  void set_destination_index(int index) {
76  destination_index_ = index;
77  }
    // Returns the number of UnicharAndFonts entries held in unichars_.
78  int size() const {
79  return unichars_.size();
80  }
81  // Returns a UnicharAndFonts entry for the given index, which must be
82  // in the range [0, size()). The index is passed straight to unichars_.
83  const UnicharAndFonts& operator[](int index) const {
84  return unichars_[index];
85  }
86  // Adds a font_id for the given unichar_id. If the unichar_id is not
87  // in the shape, it is added.
88  void AddToShape(int unichar_id, int font_id);
89  // Adds everything in other to this.
90  void AddShape(const Shape& other);
91  // Returns true if the shape contains the given unichar_id, font_id pair.
92  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
93  // Returns true if the shape contains the given unichar_id, ignoring font.
94  bool ContainsUnichar(int unichar_id) const;
95  // Returns true if the shape contains the given font, ignoring unichar_id.
96  bool ContainsFont(int font_id) const;
97  // Returns true if this is a subset (including equal) of other.
98  bool IsSubsetOf(const Shape& other) const;
99  // Returns true if the lists of unichar ids are the same in this and other,
100  // ignoring fonts.
101  // NOT const, as it will sort the unichars on demand.
102  bool IsEqualUnichars(Shape* other);
103 
104  private:
105  // Sorts the unichars_ vector by unichar.
106  void SortUnichars();
107 
108  // Flag indicates that the unichars are sorted, allowing faster set
109  // operations with another shape.
110  bool unichars_sorted_;
111  // If this Shape is part of a ShapeTable the destination_index_ is the index
112  // of some other shape in the ShapeTable with which this shape is merged.
113  int destination_index_;
114  // Array of unichars, each with a set of fonts. Each unichar has at most
115  // one entry in the vector.
116  GenericVector<UnicharAndFonts> unichars_;
117 };
118 
119 // ShapeTable is a class to encapsulate the triple indirection that is
120 // used here.
121 // ShapeTable is a vector of shapes.
122 // Each shape is a vector of UnicharAndFonts representing the set of unichars
123 // that the shape represents.
124 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
125 // mapped to the shape during training.
126 class ShapeTable {
127  public:
     // NOTE(review): presumably leaves unicharset_ unset until set_unicharset
     // is called -- confirm against the definition; unicharset() below
     // dereferences the pointer unconditionally.
128  ShapeTable();
129  // The UNICHARSET reference supplied here, or in set_unicharset below must
130  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
131  explicit ShapeTable(const UNICHARSET& unicharset);
132 
133  // Writes to the given file. Returns false in case of error.
134  bool Serialize(FILE* fp) const;
135  // Reads from the given file. Returns false in case of error.
136  // If swap is true, assumes a big/little-endian swap is needed.
137  bool DeSerialize(bool swap, FILE* fp);
138 
139  // Accessors.
     // Returns the number of entries in shape_table_.
140  int NumShapes() const {
141  return shape_table_.size();
142  }
     // Returns the UNICHARSET supplied at construction or via set_unicharset.
     // Dereferences unicharset_ without a null check.
143  const UNICHARSET& unicharset() const {
144  return *unicharset_;
145  }
146  // ShapeTable stores a pointer to the UNICHARSET, so it must persist for
147  // the entire life of the ShapeTable.
148  void set_unicharset(const UNICHARSET& unicharset) {
149  unicharset_ = &unicharset;
150  }
151  // Returns a string listing the classes/fonts in a shape.
152  STRING DebugStr(int shape_id) const;
153  // Returns a debug string summarizing the table.
154  STRING SummaryStr() const;
155 
156  // Adds a new shape starting with the given unichar_id and font_id.
157  // Returns the assigned index.
158  int AddShape(int unichar_id, int font_id);
159  // Adds a copy of the given shape.
160  // Returns the assigned index.
161  int AddShape(const Shape& other);
162  // Removes the shape given by the shape index. All indices above are changed!
163  void DeleteShape(int shape_id);
164  // Adds a font_id to the given existing shape index for the given
165  // unichar_id. If the unichar_id is not in the shape, it is added.
166  void AddToShape(int shape_id, int unichar_id, int font_id);
167  // Adds the given shape to the existing shape with the given index.
168  void AddShapeToShape(int shape_id, const Shape& other);
169  // Returns the id of the shape that contains the given unichar and font.
170  // If not found, returns -1.
171  // If font_id < 0, the font_id is ignored and the first shape that matches
172  // the unichar_id is returned.
173  int FindShape(int unichar_id, int font_id) const;
174  // Returns the first unichar_id and font_id in the given shape, through
     // the unichar_id and font_id output pointers.
175  void GetFirstUnicharAndFont(int shape_id,
176  int* unichar_id, int* font_id) const;
177 
178  // Accessors for the Shape with the given shape_id. shape_id is used
     // directly as an index into shape_table_.
179  const Shape& GetShape(int shape_id) const {
180  return *shape_table_[shape_id];
181  }
     // Returns the stored pointer itself, allowing the Shape to be modified.
182  Shape* MutableShape(int shape_id) {
183  return shape_table_[shape_id];
184  }
185 
186  // Expands all the classes/fonts in the shape individually to build
187  // a ShapeTable.
     // NOTE(review): the meaning of the int return value and the role of
     // master_shapes are not documented here -- confirm against the definition.
188  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
189 
190  // Returns true if the shapes are already merged.
191  bool AlreadyMerged(int shape_id1, int shape_id2) const;
192  // Returns true if any shape contains multiple unichars.
193  bool AnyMultipleUnichars() const;
194  // Returns the maximum number of unichars over all shapes.
195  int MaxNumUnichars() const;
196  // Merges shapes with a common unichar over the [start, end) interval.
197  // Assumes single unichar per shape.
198  void ForceFontMerges(int start, int end);
199  // Returns the number of unichars in the master shape.
200  int MasterUnicharCount(int shape_id) const;
201  // Returns the sum of the font counts in the master shape.
202  int MasterFontCount(int shape_id) const;
203  // Returns the number of unichars that would result from merging the shapes.
204  int MergedUnicharCount(int shape_id1, int shape_id2) const;
205  // Merges two shape_ids, leaving shape_id2 marked as merged.
206  void MergeShapes(int shape_id1, int shape_id2);
207  // Appends the master shapes from other to this.
208  // Used to create a clean ShapeTable from a merged one, or to create a
209  // copy of a ShapeTable.
210  void AppendMasterShapes(const ShapeTable& other);
211  // Returns the number of master shapes remaining after merging.
212  int NumMasterShapes() const;
213  // Returns the destination of this shape, (if merged), taking into account
214  // the fact that the destination may itself have been merged.
215  // For a non-merged shape, returns the input shape_id.
216  int MasterDestinationIndex(int shape_id) const;
217 
218  private:
219  // Pointer to a provided unicharset used only by the DebugStr member.
     // Not owned: it only ever receives the address of a caller-supplied
     // reference (see set_unicharset).
220  const UNICHARSET* unicharset_;
221  // Vector of pointers to the Shapes in this ShapeTable.
222  PointerVector<Shape> shape_table_;
223 };
224 
225 } // namespace tesseract.
226 
227 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_