Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::SquishedDawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::SquishedDawg:
tesseract::Dawg

List of all members.

Public Member Functions

 SquishedDawg (FILE *file, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 SquishedDawg (const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 ~SquishedDawg ()
int NumEdges ()
EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
 Returns the edge that corresponds to the letter out of this node.
void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF next_node (EDGE_REF edge) const
bool end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void print_node (NODE_REF node, int max_num_edges) const
void write_squished_dawg (FILE *file)
 Writes the squished/reduced Dawg to a file.
void write_squished_dawg (const char *filename)
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
const STRING & lang () const
PermuterType permuter () const
virtual ~Dawg ()
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg.
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char * > *cb) const
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const

Additional Inherited Members

- Static Public Attributes inherited from tesseract::Dawg
static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file.
static const UNICHAR_ID kPatternUnicharID = 0
- Protected Member Functions inherited from tesseract::Dawg
 Dawg ()
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge.
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge.
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge.
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word.
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge.
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg.
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges.
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
void init (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const char * > *cb) const
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
STRING lang_
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg.
int unicharset_size_
int flag_start_bit_
int next_node_start_bit_
uinT64 next_node_mask_
uinT64 flags_mask_
uinT64 letter_mask_
int debug_level_

Detailed Description

Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words can not be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).

Definition at line 352 of file dawg.h.


Constructor & Destructor Documentation

tesseract::SquishedDawg::SquishedDawg ( FILE *  file,
DawgType  type,
const STRING & lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 354 of file dawg.h.

{
// Populate this dawg from the caller's already-open stream. The stream is
// not closed here, so it remains the caller's responsibility.
// NOTE(review): file is passed on without a NULL check — confirm callers
// always supply a valid stream.
read_squished_dawg(file, type, lang, perm, debug_level);
// Cache the forward-edge count of node 0; edge_char_of() uses it as the
// upper bound of its binary search over node 0.
num_forward_edges_in_node0 = num_forward_edges(0);
}
tesseract::SquishedDawg::SquishedDawg ( const char *  filename,
DawgType  type,
const STRING & lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 359 of file dawg.h.

{
  // Open the dawg file for binary reading. Failing to open it is treated as
  // fatal: the error is reported and the process exits with status 1.
  FILE *dawg_file = fopen(filename, "rb");
  if (!dawg_file) {
    tprintf("Failed to open dawg file %s\n", filename);
    exit(1);
  }

  // Populate this dawg from the stream.
  read_squished_dawg(dawg_file, type, lang, perm, debug_level);

  // Cached for the binary search over node 0 in edge_char_of().
  num_forward_edges_in_node0 = num_forward_edges(0);

  // The stream was opened here, so it is closed here as well.
  fclose(dawg_file);
}
tesseract::SquishedDawg::SquishedDawg ( EDGE_ARRAY  edges,
int  num_edges,
DawgType  type,
const STRING & lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 370 of file dawg.h.

:
// Store the caller-supplied edge array and its length directly (no copy is
// made). The destructor passes edges_ to memfree(), so this object takes
// over responsibility for releasing the array.
edges_(edges), num_edges_(num_edges) {
// Set up the inherited Dawg state (type, language, permuter, unicharset
// size and debug level).
init(type, lang, perm, unicharset_size, debug_level);
// Cache the forward-edge count of node 0; edge_char_of() uses it as the
// upper bound of its binary search over node 0.
num_forward_edges_in_node0 = num_forward_edges(0);
// Dump the whole dawg only at high debug verbosity.
if (debug_level > 3) print_all("SquishedDawg:");
}
tesseract::SquishedDawg::~SquishedDawg ( )

Definition at line 181 of file dawg.cpp.

{ memfree(edges_); }  // Release the edge array held by this dawg.

Member Function Documentation

EDGE_REF tesseract::SquishedDawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
virtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 183 of file dawg.cpp.

{
  // Looks up the edge leaving `node` that is labelled with `unichar_id`.
  // When `word_end` is true only an edge that also ends a word qualifies.
  // Returns the matching EDGE_REF, or NO_EDGE when there is none.

  if (node == 0) {
    // Node 0 is searched by bisection over its forward edges (presumably
    // kept in the order defined by given_greater_than_edge_rec()).
    EDGE_REF lo = 0;
    EDGE_REF hi = num_forward_edges_in_node0 - 1;
    while (lo <= hi) {
      const EDGE_REF mid = (lo + hi) >> 1;  // midpoint of [lo, hi]
      const int cmp = given_greater_than_edge_rec(NO_EDGE, word_end,
                                                  unichar_id, edges_[mid]);
      if (cmp == 0) return mid;  // given == edge at mid
      if (cmp == 1) {
        lo = mid + 1;            // given > edge at mid
      } else {
        hi = mid - 1;            // given < edge at mid
      }
    }
    return NO_EDGE;  // not found
  }

  // Every other node: scan its edges one by one.
  if (node == NO_EDGE || !edge_occupied(node)) return NO_EDGE;

  for (EDGE_REF candidate = node; ; ++candidate) {
    const bool same_letter =
        unichar_id_from_edge_rec(edges_[candidate]) == unichar_id;
    if (same_letter &&
        (!word_end || end_of_word_from_edge_rec(edges_[candidate]))) {
      return candidate;
    }
    if (last_edge(candidate)) break;  // ran past the node's final edge
  }
  return NO_EDGE;  // not found
}
UNICHAR_ID tesseract::SquishedDawg::edge_letter ( EDGE_REF  edge_ref) const
inline virtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 410 of file dawg.h.

{
  // Decode the UNICHAR_ID field of the edge record that edge_ref indexes.
  return unichar_id_from_edge_rec(edges_[edge_ref]);
}
bool tesseract::SquishedDawg::end_of_word ( EDGE_REF  edge_ref) const
inline virtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 405 of file dawg.h.

{
  // Decode the end-of-word flag of the edge record that edge_ref indexes.
  return end_of_word_from_edge_rec(edges_[edge_ref]);
}
NODE_REF tesseract::SquishedDawg::next_node ( EDGE_REF  edge) const
inline virtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 399 of file dawg.h.

{
  // Decode the next-node link of the edge record that edge indexes.
  return next_node_from_edge_rec(edges_[edge]);
}
int tesseract::SquishedDawg::NumEdges ( )
inline

Definition at line 380 of file dawg.h.

{ return num_edges_; }  // Edge count recorded for this dawg's edge array.
void tesseract::SquishedDawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 228 of file dawg.cpp.

{
  // Prints the edges of the node starting at `node` via tprintf: first the
  // run of edges beginning at `node`, then, if one directly follows it, the
  // run of backward edges. Output stops once more than `max_num_edges`
  // edges past `node` have been visited. Both runs honour the caller's
  // limit (the backward run previously used the global
  // MAX_NODE_EDGES_DISPLAY instead of the parameter).
  if (node == NO_EDGE) return;  // nothing to print
  EDGE_REF edge = node;
  const char *forward_string = "FORWARD";
  const char *backward_string = " ";
  const char *last_string = "LAST";
  const char *not_last_string = " ";
  const char *eow_string = "EOW";
  const char *not_eow_string = " ";
  const char *direction;
  const char *is_last;
  const char *eow;
  UNICHAR_ID unichar_id;
  if (edge_occupied(edge)) {
    // First run: the edges that start at `node`.
    do {
      direction =
        forward_edge(edge) ? forward_string : backward_string;
      is_last = last_edge(edge) ? last_string : not_last_string;
      eow = end_of_word(edge) ? eow_string : not_eow_string;
      unichar_id = edge_letter(edge);
      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
              edge, next_node(edge), unichar_id,
              direction, is_last, eow);
      if (edge - node > max_num_edges) return;
    } while (!last_edge(edge++));
    // Second run: backward edges stored right after the first run, if any.
    // The bounds test comes first so edges_ is never read past its end.
    if (edge < num_edges_ &&
        edge_occupied(edge) && backward_edge(edge)) {
      do {
        direction =
          forward_edge(edge) ? forward_string : backward_string;
        is_last = last_edge(edge) ? last_string : not_last_string;
        eow = end_of_word(edge) ? eow_string : not_eow_string;
        unichar_id = edge_letter(edge);
        tprintf(REFFORMAT " : next = " REFFORMAT
                ", unichar_id = %d, %s %s %s\n",
                edge, next_node(edge), unichar_id,
                direction, is_last, eow);
        if (edge - node > max_num_edges) return;
      } while (!last_edge(edge++));
    }
  }
  else {
    tprintf(REFFORMAT " : no edges in this node\n", node);
  }
  tprintf("\n");
}
void tesseract::SquishedDawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector * vec
) const
inline virtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 388 of file dawg.h.

{
  // Appends one NodeChild (unichar id + EDGE_REF) to *vec for every edge of
  // the node starting at `node`. Nothing is appended when `node` is NO_EDGE
  // or the node has no occupied edge.
  EDGE_REF edge = node;
  // Test for NO_EDGE before calling edge_occupied(): the helper presumably
  // reads edges_[edge], which would be out of bounds for an invalid
  // reference. This is the same check order edge_char_of() uses.
  if (edge == NO_EDGE || !edge_occupied(edge)) return;
  assert(forward_edge(edge));  // we don't expect any backward edges to
                               // be present when this function is called
  do {
    vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
  } while (!last_edge(edge++));
}
void tesseract::SquishedDawg::write_squished_dawg ( FILE *  file)

Writes the squished/reduced Dawg to a file.

Definition at line 369 of file dawg.cpp.

{
  // Serialises this dawg to the caller's open stream in this order:
  //   1. the magic number (inT16), used to detect an endianness change,
  //   2. unicharset_size_ (written as inT32),
  //   3. the number of forward edges (inT32),
  //   4. every forward edge record, with its next-node link remapped
  //      through the table returned by build_node_map().
  // Backward edges are not written. The stream is not closed here.
  EDGE_REF edge;
  inT32 num_edges;
  inT32 node_count = 0;
  NODE_MAP node_map;
  EDGE_REF old_index;
  EDGE_RECORD temp_record;
  if (debug_level_) tprintf("write_squished_dawg\n");
  node_map = build_node_map(&node_count);
  // Write the magic number to help detecting a change in endianness.
  // fwrite() needs an addressable object, so copy the class constant into a
  // local first (the local was previously used without being declared).
  inT16 magic = kDawgMagicNumber;
  fwrite(&magic, sizeof(inT16), 1, file);
  fwrite(&unicharset_size_, sizeof(inT32), 1, file);
  // Count the number of edges in this Dawg.
  num_edges = 0;
  for (edge = 0; edge < num_edges_; edge++)
    if (forward_edge(edge))
      num_edges++;
  fwrite(&num_edges, sizeof(inT32), 1, file);  // write edge count to file
  if (debug_level_) {
    tprintf("%d nodes in DAWG\n", node_count);
    tprintf("%d edges in DAWG\n", num_edges);
  }
  for (edge = 0; edge < num_edges_; edge++) {
    if (forward_edge(edge)) {  // write forward edges
      do {
        // Temporarily point the edge at its remapped node, write a copy of
        // the record, then restore the in-memory link.
        old_index = next_node_from_edge_rec(edges_[edge]);
        set_next_node(edge, node_map[old_index]);
        temp_record = edges_[edge];
        fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
        set_next_node(edge, old_index);
      } while (!last_edge(edge++));
      if (edge >= num_edges_) break;
      if (backward_edge(edge))  // skip back links
        while (!last_edge(edge++));
      edge--;  // compensate for the edge++ of the enclosing for loop
    }
  }
  free(node_map);
}
void tesseract::SquishedDawg::write_squished_dawg ( const char *  filename)
inline

Opens the file with the given filename and writes the squished/reduced Dawg to the file.

Definition at line 423 of file dawg.h.

{
  // Create or truncate the target file for binary writing. Failing to open
  // it is treated as fatal: the error is reported and the process exits
  // with status 1.
  FILE *out_file = fopen(filename, "wb");
  if (!out_file) {
    tprintf("Error opening %s\n", filename);
    exit(1);
  }

  // Delegate the actual serialisation to the FILE* overload, then close the
  // stream that was opened here.
  write_squished_dawg(out_file);
  fclose(out_file);
}

The documentation for this class was generated from the following files: