@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
28
28
import com .itextpdf .styledxmlparser .jsoup .Jsoup ;
29
29
import com .itextpdf .styledxmlparser .jsoup .nodes .Document ;
30
30
import com .itextpdf .styledxmlparser .jsoup .nodes .Element ;
31
+ import com .itextpdf .styledxmlparser .jsoup .nodes .Node ;
31
32
import com .itextpdf .styledxmlparser .jsoup .select .Elements ;
32
33
33
34
import java .io .File ;
@@ -60,6 +61,27 @@ public class TesseractHelper {
60
61
private static final Logger LOGGER = LoggerFactory
61
62
.getLogger (TesseractHelper .class );
62
63
64
+ /**
65
+ * Patterns for matching hOCR element bboxes.
66
+ */
67
+ private static final Pattern BBOX_PATTERN = Pattern .compile (".*bbox(\\ s+\\ d+){4}.*" );
68
+ private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
69
+ .compile (
70
+ ".*\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+).*" );
71
+
72
+ /**
73
+ * Indices in array representing bbox.
74
+ */
75
+ private static final int LEFT_IDX = 0 ;
76
+ private static final int BOTTOM_IDX = 1 ;
77
+ private static final int RIGHT_IDX = 2 ;
78
+ private static final int TOP_IDX = 3 ;
79
+
80
+ /**
81
+ * Size of the array containing bbox.
82
+ */
83
+ private static final int BBOX_ARRAY_SIZE = 4 ;
84
+
63
85
/**
64
86
* Creates a new {@link TesseractHelper} instance.
65
87
*/
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
86
108
throws IOException {
87
109
Map <Integer , List <TextInfo >> imageData =
88
110
new LinkedHashMap <Integer , List <TextInfo >>();
111
+ Map <String , Node > unparsedBBoxes = new LinkedHashMap <>();
89
112
90
113
for (File inputFile : inputFiles ) {
91
114
if (inputFile != null
92
115
&& Files .exists (
93
- java .nio .file .Paths
94
- .get (inputFile .getAbsolutePath ()))) {
116
+ java .nio .file .Paths
117
+ .get (inputFile .getAbsolutePath ()))) {
95
118
FileInputStream fileInputStream =
96
119
new FileInputStream (inputFile .getAbsolutePath ());
97
120
Document doc = Jsoup .parse (fileInputStream ,
98
121
java .nio .charset .StandardCharsets .UTF_8 .name (),
99
122
inputFile .getAbsolutePath ());
100
123
Elements pages = doc .getElementsByClass ("ocr_page" );
101
124
102
- Pattern bboxPattern = Pattern .compile (".*bbox(\\ s+\\ d+){4}.*" );
103
- Pattern bboxCoordinatePattern = Pattern
104
- .compile (
105
- ".*\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+).*" );
106
125
List <String > searchedClasses = TextPositioning .BY_LINES
107
126
.equals (textPositioning )
108
127
? Arrays .<String >asList ("ocr_line" , "ocr_caption" )
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
124
143
}
125
144
}
126
145
for (Element obj : objects ) {
127
- String value = obj .attr ("title" );
128
- Matcher bboxMatcher = bboxPattern .matcher (value );
129
- if (bboxMatcher .matches ()) {
130
- Matcher bboxCoordinateMatcher =
131
- bboxCoordinatePattern
132
- .matcher (bboxMatcher .group ());
133
- if (bboxCoordinateMatcher .matches ()) {
134
- List <Float > coordinates =
135
- new ArrayList <Float >();
136
- for (int i = 0 ; i < 4 ; i ++) {
137
- String coord = bboxCoordinateMatcher
138
- .group (i + 1 );
139
- coordinates
140
- .add (Float .parseFloat (coord ));
141
- }
142
-
143
- textData .add (new TextInfo (obj .text (),
144
- coordinates ));
145
- }
146
- }
146
+ List <Float > coordinates = getAlignedBBox (obj ,
147
+ textPositioning ,
148
+ unparsedBBoxes );
149
+ textData .add (new TextInfo (obj .text (),
150
+ coordinates ));
147
151
}
148
152
}
149
153
if (textData .size () > 0 ) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
157
161
fileInputStream .close ();
158
162
}
159
163
}
164
+ for (Node node : unparsedBBoxes .values ()) {
165
+ LOGGER .warn (MessageFormatUtil .format (
166
+ Tesseract4LogMessageConstant .CANNOT_PARSE_NODE_BBOX ,
167
+ node .toString ()
168
+ ));
169
+ }
160
170
return imageData ;
161
171
}
162
172
173
+ /**
174
+ * Get and align (if needed) bbox of the element.
175
+ */
176
+ static List <Float > getAlignedBBox (Element object ,
177
+ TextPositioning textPositioning ,
178
+ Map <String , Node > unparsedBBoxes ) {
179
+ final List <Float > coordinates = parseBBox (object , unparsedBBoxes );
180
+ if (TextPositioning .BY_WORDS_AND_LINES == textPositioning
181
+ || TextPositioning .BY_WORDS == textPositioning ) {
182
+ Node line = object .parent ();
183
+ final List <Float > lineCoordinates = parseBBox (line , unparsedBBoxes );
184
+ if (TextPositioning .BY_WORDS_AND_LINES == textPositioning ) {
185
+ coordinates .set (BOTTOM_IDX , lineCoordinates .get (BOTTOM_IDX ));
186
+ coordinates .set (TOP_IDX , lineCoordinates .get (TOP_IDX ));
187
+ }
188
+ detectAndFixBrokenBBoxes (object , coordinates ,
189
+ lineCoordinates , unparsedBBoxes );
190
+ }
191
+ return coordinates ;
192
+ }
193
+
194
+ /**
195
+ * Parses element bbox.
196
+ *
197
+ * @param node element containing bbox
198
+ * @param unparsedBBoxes list of element ids with bboxes which could not be parsed
199
+ * @return parsed bbox
200
+ */
201
+ static List <Float > parseBBox (Node node , Map <String , Node > unparsedBBoxes ) {
202
+ List <Float > bbox = new ArrayList <>();
203
+ Matcher bboxMatcher = BBOX_PATTERN .matcher (node .attr ("title" ));
204
+ if (bboxMatcher .matches ()) {
205
+ Matcher bboxCoordinateMatcher =
206
+ BBOX_COORDINATE_PATTERN
207
+ .matcher (bboxMatcher .group ());
208
+ if (bboxCoordinateMatcher .matches ()) {
209
+ for (int i = 0 ; i < BBOX_ARRAY_SIZE ; i ++) {
210
+ String coord = bboxCoordinateMatcher
211
+ .group (i + 1 );
212
+ bbox .add (Float .parseFloat (coord ));
213
+ }
214
+ }
215
+ }
216
+ if (bbox .size () == 0 ) {
217
+ bbox = Arrays .asList (0f , 0f , 0f , 0f );
218
+ String id = node .attr ("id" );
219
+ if (id != null && !unparsedBBoxes .containsKey (id )) {
220
+ unparsedBBoxes .put (id , node );
221
+ }
222
+ }
223
+ return bbox ;
224
+ }
225
+
226
+ /**
227
+ * Sometimes hOCR file contains broke character bboxes which are equal to page bbox.
228
+ * This method attempts to detect and fix them.
229
+ */
230
+ static void detectAndFixBrokenBBoxes (Element object , List <Float > coordinates ,
231
+ List <Float > lineCoordinates ,
232
+ Map <String , Node > unparsedBBoxes ) {
233
+ if (coordinates .get (LEFT_IDX ) < lineCoordinates .get (LEFT_IDX )
234
+ || coordinates .get (LEFT_IDX ) > lineCoordinates .get (RIGHT_IDX )) {
235
+ if (object .previousElementSibling () == null ) {
236
+ coordinates .set (LEFT_IDX , lineCoordinates .get (LEFT_IDX ));
237
+ } else {
238
+ Element sibling = object .previousElementSibling ();
239
+ List <Float > siblingBBox = parseBBox (sibling , unparsedBBoxes );
240
+ coordinates .set (LEFT_IDX , siblingBBox .get (RIGHT_IDX ));
241
+ }
242
+ }
243
+ if (coordinates .get (RIGHT_IDX ) > lineCoordinates .get (RIGHT_IDX )
244
+ || coordinates .get (RIGHT_IDX ) < lineCoordinates .get (LEFT_IDX )) {
245
+ if (object .nextElementSibling () == null ) {
246
+ coordinates .set (RIGHT_IDX , lineCoordinates .get (RIGHT_IDX ));
247
+ } else {
248
+ Element sibling = object .nextElementSibling ();
249
+ List <Float > siblingBBox = parseBBox (sibling , unparsedBBoxes );
250
+ coordinates .set (RIGHT_IDX , siblingBBox .get (LEFT_IDX ));
251
+ }
252
+ }
253
+ }
254
+
163
255
/**
164
256
* Deletes file using provided path.
165
257
*
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
208
300
* @param data text data in required format as {@link java.lang.String}
209
301
*/
210
302
static void writeToTextFile (final String path ,
211
- final String data ) {
303
+ final String data ) {
212
304
try (Writer writer = new OutputStreamWriter (new FileOutputStream (path ),
213
305
StandardCharsets .UTF_8 )) {
214
306
writer .write (data );
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
228
320
* @throws Tesseract4OcrException if provided command failed
229
321
*/
230
322
static void runCommand (final String execPath ,
231
- final List <String > paramsList ) throws Tesseract4OcrException {
323
+ final List <String > paramsList ) throws Tesseract4OcrException {
232
324
try {
233
325
String params = String .join (" " , paramsList );
234
326
boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
251
343
.TESSERACT_FAILED );
252
344
}
253
345
}
254
- }
346
+ }
0 commit comments