# Copyright 2009 Karhea Oy.
# Author: Mika Raento mikie@iki.fi
#
# Licensed as FREEWARE: you may use and distribute unmodified and modified
# copies of this work as you wish.
#
# boxes.pl
#
# An example of how to produce synthetic testing data for tesseract.
#
# Reads text from standard input, draws it onto a cairo surface and saves it
# as output2.png. Prints tesseract training box data to standard output.
#
# Input and output are utf-8.
#
# Character and line spacing is exaggerated as recommended in the tesseract
# training documentation.

use strict;
use warnings;
use Cairo;
use Data::Dumper;
use POSIX qw(ceil floor);

# Page geometry in pixels: 8 x 12 inches at 300 dots per inch.
my $dpi = 300;
my $w   = 8 * $dpi;
my $h   = 12 * $dpi;

# 10 pt text converted to pixels (72 points per inch).
my $fontsize = 10 * $dpi / 72;

# Vertical distance between consecutive text lines: twice the font size.
my $line_height = $fontsize * 2;

# Extra horizontal gap, in pixels, added after every glyph's own advance.
my $spacing = 10;

my $surface = Cairo::ImageSurface->create('argb32', $w, $h);
my $cr      = Cairo::Context->create($surface);
$cr->select_font_face("Helvetica", "normal", "normal");
$cr->set_font_size($fontsize);

# Convert cairo rectangle (top-left is 0, 0 - 3rd and 4th values are width
# and height) to tesseract box (bottom-left is 0, 0 - 3rd and 4th values are
# co-ordinates of second corner).
sub to_tess_box {
    # Argument: reference to [x, y, width, height] with the origin at the
    # top-left of the page.
    # Returns:  reference to [left, bottom, right, top] with the origin at the
    # bottom-left of the page, as integers.
    my ($rect) = @_;
    my ($left, $top_y, $width, $height) = @$rect;

    # Flip the vertical axis using the page height $h.
    my $top    = $h - $top_y;
    my $bottom = $top - $height;

    # Round outwards (floor the lower corner, ceil the upper corner) so the
    # integer box never cuts into the glyph.
    return [
        floor($left),
        floor($bottom),
        ceil($left + $width),
        ceil($top),
    ];
}

# Paint the whole page white before drawing any text.
$cr->rectangle(0, 0, $w, $h);
$cr->set_source_rgb(1, 1, 1);
$cr->fill;

# Set to a true value to shade every glyph's bounding box for visual
# inspection. Light grey is used because a white fill would be invisible on
# the white page.
my $draw_debug_boxes = 0;

my $line_no = 0;
while (my $text = <STDIN>) {
    # chomp (not chop) so that a final line without a trailing newline does
    # not lose its last character; then drop the CR of a CRLF line ending.
    chomp $text;
    $text =~ s/\r\z//;
    $line_no++;

    # Input is utf-8 bytes; decode so that split() yields whole characters.
    utf8::decode($text);

    my $y = $line_no * $line_height;
    my $x = 10;
    foreach my $c (split(//, $text)) {
        # Back to utf-8 bytes for cairo and for the box output.
        utf8::encode($c);

        my $ext  = $cr->text_extents($c);
        my $bbox = [
            $x + $ext->{x_bearing},
            $y + $ext->{y_bearing},
            $ext->{width},
            $ext->{height},
        ];

        if ($draw_debug_boxes) {
            $cr->rectangle(@$bbox);
            $cr->set_source_rgb(0.8, 0.8, 0.8);
            $cr->fill;
        }

        $cr->move_to($x, $y);
        $cr->set_source_rgb(0, 0, 0);
        $cr->show_text($c);

        # Advance by the glyph's own advance plus the exaggerated spacing.
        $x += $ext->{x_advance} + $spacing;

        # Spaces move the pen but get no box entry.
        if ($c ne ' ') {
            my $tess_box = to_tess_box($bbox);
            print $c, " ", join(" ", @$tess_box), "\n";
        }
    }
}

$cr->show_page;

# write_to_png reports failure through its return status, so check it rather
# than silently producing box data with no matching image.
my $status = $surface->write_to_png("output2.png");
die "could not write output2.png: $status\n" unless $status eq 'success';