Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e413823

Browse files
committed Jul 13, 2020
[RELEASE] iText 7 pdfOcr - 1.0.1
https://git.itextsupport.com/
* release/1.0.1:
  - [RELEASE] 1.0.1-SNAPSHOT -> 1.0.1
  - Synchronized collections to avoid test failures
  - Remove unused fields
  - Improvements in word bbox calculation
  - Depend on iText snapshot. Remove redundant methods from ReflectionUtils
  - [RELEASE] Update dependency versions
  - Refactor the path to tessdata so that it doesn't end with a slash
  - Remove irrelevant comment
  - Minor Javadoc fix
  - [RELEASE] Update dependency versions
2 parents f271a87 + 1d10246 commit e413823

File tree

22 files changed

+330
-154
lines changed

22 files changed

+330
-154
lines changed
 

‎pdfocr-api/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>com.itextpdf</groupId>
77
<artifactId>pdfocr-root</artifactId>
8-
<version>1.0.0</version>
8+
<version>1.0.1</version>
99
</parent>
1010

1111
<artifactId>pdfocr-api</artifactId>

‎pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ public final OcrPdfCreatorProperties setTitle(
282282

283283
/**
284284
* Returns FontProvider that was set previously or if it is
285-
* <code>null<code/> a new instance of {@link PdfOcrFontProvider} is
285+
* <code>null</code> a new instance of {@link PdfOcrFontProvider} is
286286
* returned.
287287
* @return {@link com.itextpdf.layout.font.FontProvider} object
288288
*/

‎pdfocr-tesseract4/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>com.itextpdf</groupId>
77
<artifactId>pdfocr-root</artifactId>
8-
<version>1.0.0</version>
8+
<version>1.0.1</version>
99
</parent>
1010

1111
<artifactId>pdfocr-tesseract4</artifactId>

‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java

Lines changed: 0 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -22,58 +22,25 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.pdfocr.tesseract4;
2424

25-
import com.itextpdf.io.util.MessageFormatUtil;
2625
import com.itextpdf.kernel.Version;
27-
import com.itextpdf.kernel.counter.ContextManager;
2826

29-
import java.lang.reflect.AccessibleObject;
3027
import java.lang.reflect.Array;
3128
import java.lang.reflect.Constructor;
3229
import java.lang.reflect.Method;
3330
import java.util.Arrays;
34-
import java.util.Collection;
35-
import java.util.Collections;
36-
import java.util.HashMap;
37-
import java.util.Map;
38-
import org.slf4j.Logger;
39-
import org.slf4j.LoggerFactory;
4031

4132
final class ReflectionUtils {
4233

43-
private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class);
44-
45-
private static final String KERNEL_PACKAGE = "com.itextpdf.kernel.";
4634
private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey.";
4735

48-
private static final String CONTEXT_MANAGER = "counter.ContextManager";
4936
private static final String LICENSEKEY = "LicenseKey";
5037
private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct";
5138
private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature";
5239

53-
private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext";
5440
private static final String SCHEDULED_CHECK = "scheduledCheck";
5541

5642
private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) to load one.";
5743

58-
private static Map<String, Class<?>> cachedClasses = new HashMap<>();
59-
private static Map<MethodSignature, AccessibleObject> cachedMethods = new HashMap<>();
60-
61-
static {
62-
try {
63-
ContextManager contextManager = ContextManager.getInstance();
64-
callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
65-
new Class[] {Collection.class, Collection.class},
66-
Collections.singletonList("com.itextpdf.pdfocr"),
67-
Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
68-
callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
69-
new Class[] {Collection.class, Collection.class},
70-
Collections.singletonList("com.itextpdf.pdfocr.tesseract4"),
71-
Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
72-
} catch (Exception e) {
73-
logger.error(e.getMessage());
74-
}
75-
}
76-
7744
private ReflectionUtils() {
7845
}
7946

@@ -116,52 +83,6 @@ public static void scheduledCheck() {
11683
}
11784
}
11885

119-
private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes,
120-
Object... args) {
121-
try {
122-
Method method = findMethod(className, methodName, parameterTypes);
123-
return method.invoke(target, args);
124-
} catch (NoSuchMethodException e) {
125-
logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className));
126-
} catch (ClassNotFoundException e) {
127-
logger.warn(MessageFormatUtil.format("Cannot find class {0}", className));
128-
} catch (IllegalArgumentException e) {
129-
logger.warn(MessageFormatUtil
130-
.format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName,
131-
e.getMessage()));
132-
} catch (Exception e) {
133-
// Converting checked exceptions to unchecked RuntimeException (java-specific comment).
134-
//
135-
// If kernel utils throws an exception at this point, we consider it as unrecoverable situation for
136-
// its callers (pdfOcr methods).
137-
// It's might be more suitable to wrap checked exceptions at a bit higher level, but we do it here for
138-
// the sake of convenience.
139-
throw new RuntimeException(e.toString(), e);
140-
}
141-
return null;
142-
}
143-
144-
private static Method findMethod(String className, String methodName, Class[] parameterTypes)
145-
throws NoSuchMethodException, ClassNotFoundException {
146-
MethodSignature tm = new MethodSignature(className, parameterTypes, methodName);
147-
Method m = (Method) cachedMethods.get(tm);
148-
if (m == null) {
149-
m = findClass(className).getDeclaredMethod(methodName, parameterTypes);
150-
m.setAccessible(true);
151-
cachedMethods.put(tm, m);
152-
}
153-
return m;
154-
}
155-
156-
private static Class<?> findClass(String className) throws ClassNotFoundException {
157-
Class<?> c = cachedClasses.get(className);
158-
if (c == null) {
159-
c = getClass(className);
160-
cachedClasses.put(className, c);
161-
}
162-
return c;
163-
}
164-
16586
private static Class<?> getClass(String className) throws ClassNotFoundException {
16687
return Class.forName(className);
16788
}

‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
5959
+ "temporary directory: {0}";
6060
public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
6161
"Cannot convert image to pix: {0}";
62+
public static final String CANNOT_PARSE_NODE_BBOX =
63+
"Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";
64+
6265

6366
private Tesseract4LogMessageConstant() {
6467
}
65-
}
68+
}

‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java

Lines changed: 121 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
2828
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
2929
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
3030
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
31+
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
3132
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
3233

3334
import java.io.File;
@@ -60,6 +61,27 @@ public class TesseractHelper {
6061
private static final Logger LOGGER = LoggerFactory
6162
.getLogger(TesseractHelper.class);
6263

64+
/**
65+
* Patterns for matching hOCR element bboxes.
66+
*/
67+
private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
68+
private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
69+
.compile(
70+
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
71+
72+
/**
73+
* Indices in array representing bbox.
74+
*/
75+
private static final int LEFT_IDX = 0;
76+
private static final int BOTTOM_IDX = 1;
77+
private static final int RIGHT_IDX = 2;
78+
private static final int TOP_IDX = 3;
79+
80+
/**
81+
* Size of the array containing bbox.
82+
*/
83+
private static final int BBOX_ARRAY_SIZE = 4;
84+
6385
/**
6486
* Creates a new {@link TesseractHelper} instance.
6587
*/
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
86108
throws IOException {
87109
Map<Integer, List<TextInfo>> imageData =
88110
new LinkedHashMap<Integer, List<TextInfo>>();
111+
Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
89112

90113
for (File inputFile : inputFiles) {
91114
if (inputFile != null
92115
&& Files.exists(
93-
java.nio.file.Paths
94-
.get(inputFile.getAbsolutePath()))) {
116+
java.nio.file.Paths
117+
.get(inputFile.getAbsolutePath()))) {
95118
FileInputStream fileInputStream =
96119
new FileInputStream(inputFile.getAbsolutePath());
97120
Document doc = Jsoup.parse(fileInputStream,
98121
java.nio.charset.StandardCharsets.UTF_8.name(),
99122
inputFile.getAbsolutePath());
100123
Elements pages = doc.getElementsByClass("ocr_page");
101124

102-
Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
103-
Pattern bboxCoordinatePattern = Pattern
104-
.compile(
105-
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
106125
List<String> searchedClasses = TextPositioning.BY_LINES
107126
.equals(textPositioning)
108127
? Arrays.<String>asList("ocr_line", "ocr_caption")
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
124143
}
125144
}
126145
for (Element obj : objects) {
127-
String value = obj.attr("title");
128-
Matcher bboxMatcher = bboxPattern.matcher(value);
129-
if (bboxMatcher.matches()) {
130-
Matcher bboxCoordinateMatcher =
131-
bboxCoordinatePattern
132-
.matcher(bboxMatcher.group());
133-
if (bboxCoordinateMatcher.matches()) {
134-
List<Float> coordinates =
135-
new ArrayList<Float>();
136-
for (int i = 0; i < 4; i++) {
137-
String coord = bboxCoordinateMatcher
138-
.group(i + 1);
139-
coordinates
140-
.add(Float.parseFloat(coord));
141-
}
142-
143-
textData.add(new TextInfo(obj.text(),
144-
coordinates));
145-
}
146-
}
146+
List<Float> coordinates = getAlignedBBox(obj,
147+
textPositioning,
148+
unparsedBBoxes);
149+
textData.add(new TextInfo(obj.text(),
150+
coordinates));
147151
}
148152
}
149153
if (textData.size() > 0) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
157161
fileInputStream.close();
158162
}
159163
}
164+
for (Node node : unparsedBBoxes.values()) {
165+
LOGGER.warn(MessageFormatUtil.format(
166+
Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
167+
node.toString()
168+
));
169+
}
160170
return imageData;
161171
}
162172

173+
/**
174+
* Get and align (if needed) bbox of the element.
175+
*/
176+
static List<Float> getAlignedBBox(Element object,
177+
TextPositioning textPositioning,
178+
Map<String, Node> unparsedBBoxes) {
179+
final List<Float> coordinates = parseBBox(object, unparsedBBoxes);
180+
if (TextPositioning.BY_WORDS_AND_LINES == textPositioning
181+
|| TextPositioning.BY_WORDS == textPositioning) {
182+
Node line = object.parent();
183+
final List<Float> lineCoordinates = parseBBox(line, unparsedBBoxes);
184+
if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) {
185+
coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX));
186+
coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX));
187+
}
188+
detectAndFixBrokenBBoxes(object, coordinates,
189+
lineCoordinates, unparsedBBoxes);
190+
}
191+
return coordinates;
192+
}
193+
194+
/**
195+
* Parses element bbox.
196+
*
197+
* @param node element containing bbox
198+
* @param unparsedBBoxes map of element ids to the nodes whose bboxes could not be parsed
199+
* @return parsed bbox
200+
*/
201+
static List<Float> parseBBox(Node node, Map<String, Node> unparsedBBoxes) {
202+
List<Float> bbox = new ArrayList<>();
203+
Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title"));
204+
if (bboxMatcher.matches()) {
205+
Matcher bboxCoordinateMatcher =
206+
BBOX_COORDINATE_PATTERN
207+
.matcher(bboxMatcher.group());
208+
if (bboxCoordinateMatcher.matches()) {
209+
for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
210+
String coord = bboxCoordinateMatcher
211+
.group(i + 1);
212+
bbox.add(Float.parseFloat(coord));
213+
}
214+
}
215+
}
216+
if (bbox.size() == 0) {
217+
bbox = Arrays.asList(0f, 0f, 0f, 0f);
218+
String id = node.attr("id");
219+
if (id != null && !unparsedBBoxes.containsKey(id)) {
220+
unparsedBBoxes.put(id, node);
221+
}
222+
}
223+
return bbox;
224+
}
225+
226+
/**
227+
* Sometimes an hOCR file contains broken character bboxes which are equal to the page bbox.
228+
* This method attempts to detect and fix them.
229+
*/
230+
static void detectAndFixBrokenBBoxes(Element object, List<Float> coordinates,
231+
List<Float> lineCoordinates,
232+
Map<String, Node> unparsedBBoxes) {
233+
if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
234+
|| coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) {
235+
if (object.previousElementSibling() == null) {
236+
coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX));
237+
} else {
238+
Element sibling = object.previousElementSibling();
239+
List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
240+
coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX));
241+
}
242+
}
243+
if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
244+
|| coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) {
245+
if (object.nextElementSibling() == null) {
246+
coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX));
247+
} else {
248+
Element sibling = object.nextElementSibling();
249+
List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
250+
coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX));
251+
}
252+
}
253+
}
254+
163255
/**
164256
* Deletes file using provided path.
165257
*
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
208300
* @param data text data in required format as {@link java.lang.String}
209301
*/
210302
static void writeToTextFile(final String path,
211-
final String data) {
303+
final String data) {
212304
try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
213305
StandardCharsets.UTF_8)) {
214306
writer.write(data);
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
228320
* @throws Tesseract4OcrException if provided command failed
229321
*/
230322
static void runCommand(final String execPath,
231-
final List<String> paramsList) throws Tesseract4OcrException {
323+
final List<String> paramsList) throws Tesseract4OcrException {
232324
try {
233325
String params = String.join(" ", paramsList);
234326
boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
251343
.TESSERACT_FAILED);
252344
}
253345
}
254-
}
346+
}

‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,9 @@ public enum TextPositioning {
3939
/**
4040
* Text will be located by words retrieved from hocr file.
4141
*/
42-
BY_WORDS
43-
}
42+
BY_WORDS,
43+
/**
44+
* Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
45+
*/
46+
BY_WORDS_AND_LINES,
47+
}

0 commit comments

Comments
 (0)
Please sign in to comment.