Skip to content

Commit 175130f

Browse files
alexander.kischukSnipx
alexander.kischuk
authored and committed
Hide possibility to set userWords
PDFOC-95
1 parent 536fe6a commit 175130f

File tree

5 files changed

+243
-99
lines changed

5 files changed

+243
-99
lines changed

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,14 +239,18 @@ public final Tesseract4OcrEngineProperties setTextPositioning(
239239
* ends with a new line character. Train data for provided language
240240
* should exist in specified tess data directory.
241241
*
242+
* NOTE:
243+
* The user words dictionary doesn't work properly in tesseract4
244+
* and is hidden from public usage until a fix is available.
245+
*
242246
* @param language language as {@link java.lang.String}, tessdata for
243247
* this language has to exist in the tess data directory
244248
* @param userWords {@link java.util.List} of custom words
245249
* @return the {@link Tesseract4OcrEngineProperties} instance
246250
* @throws Tesseract4OcrException if one of given languages wasn't specified in the
247251
* list of required languages for OCR using the {@link Tesseract4OcrEngineProperties#setLanguages(List)} method
248252
*/
249-
public Tesseract4OcrEngineProperties setUserWords(final String language,
253+
Tesseract4OcrEngineProperties setUserWords(final String language,
250254
final List<String> userWords)
251255
throws Tesseract4OcrException {
252256
setPathToUserWordsFile(null);
@@ -281,6 +285,10 @@ public Tesseract4OcrEngineProperties setUserWords(final String language,
281285
* a new line character. Train data for provided language
282286
* should exist in specified tess data directory.
283287
*
288+
* NOTE:
289+
* The user words dictionary doesn't work properly in tesseract4
290+
* and is hidden from public usage until a fix is available.
291+
*
284292
* @param language language as {@link java.lang.String}, tessdata for
285293
* this language has to exist in the tess data directory
286294
* @param inputStream custom user words as {@link java.io.InputStream}
@@ -289,7 +297,7 @@ public Tesseract4OcrEngineProperties setUserWords(final String language,
289297
* {@link Tesseract4OcrEngineProperties#setLanguages(List)} method
290298
* @return the {@link Tesseract4OcrEngineProperties} instance
291299
*/
292-
public Tesseract4OcrEngineProperties setUserWords(final String language,
300+
Tesseract4OcrEngineProperties setUserWords(final String language,
293301
final InputStream inputStream) throws Tesseract4OcrException {
294302
setPathToUserWordsFile(null);
295303
if (!getLanguages().contains(language)) {
@@ -328,21 +336,29 @@ public Tesseract4OcrEngineProperties setUserWords(final String language,
328336
/**
329337
* Returns path to the user words file.
330338
*
339+
* NOTE:
340+
* The user words dictionary doesn't work properly in tesseract4
341+
* and is hidden from public usage until a fix is available.
342+
*
331343
* @return path to user words file as {@link java.lang.String} if it
332344
* exists, otherwise - null
333345
*/
334-
public final String getPathToUserWordsFile() {
346+
final String getPathToUserWordsFile() {
335347
return pathToUserWordsFile;
336348
}
337349

338350
/**
339351
* Sets path to the user words file.
340352
*
353+
* NOTE:
354+
* The user words dictionary doesn't work properly in tesseract4
355+
* and is hidden from public usage until a fix is available.
356+
*
341357
* @param pathToUserWordsFile path to user words file
342358
* as {@link java.lang.String}
343359
* @return the {@link Tesseract4OcrEngineProperties} instance
344360
*/
345-
public final Tesseract4OcrEngineProperties setPathToUserWordsFile(
361+
final Tesseract4OcrEngineProperties setPathToUserWordsFile(
346362
String pathToUserWordsFile) {
347363
return setPathToUserWordsFile(pathToUserWordsFile, false);
348364
}

pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationTest.java

Lines changed: 0 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.pdfocr.tessdata;
2424

25-
import com.itextpdf.io.util.MessageFormatUtil;
2625
import com.itextpdf.kernel.colors.DeviceCmyk;
2726
import com.itextpdf.kernel.pdf.PdfWriter;
2827
import com.itextpdf.kernel.utils.CompareTool;
@@ -32,14 +31,11 @@ This file is part of the iText (R) project.
3231
import com.itextpdf.pdfocr.PdfOcrLogMessageConstant;
3332
import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine;
3433
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
35-
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException;
3634
import com.itextpdf.pdfocr.tesseract4.TextPositioning;
3735
import com.itextpdf.test.annotations.LogMessage;
3836
import com.itextpdf.test.annotations.LogMessages;
3937

4038
import java.io.File;
41-
import java.io.FileInputStream;
42-
import java.io.FileNotFoundException;
4339
import java.io.IOException;
4440
import java.nio.file.Files;
4541
import java.util.ArrayList;
@@ -627,97 +623,6 @@ public void testJapaneseScript() {
627623
Assert.assertEquals(expected, result);
628624
}
629625

630-
@Test
631-
public void testCustomUserWords() {
632-
String imgPath = TEST_IMAGES_DIRECTORY + "wierdwords.png";
633-
List<String> userWords = Arrays.<String>asList("he23llo", "qwetyrtyqpwe-rty");
634-
635-
Tesseract4OcrEngineProperties properties =
636-
tesseractReader.getTesseract4OcrEngineProperties();
637-
properties.setLanguages(Arrays.asList("fra"));
638-
properties.setUserWords("fra", userWords);
639-
tesseractReader.setTesseract4OcrEngineProperties(properties);
640-
String result = getRecognizedTextFromTextFile(tesseractReader, imgPath);
641-
Assert.assertTrue(result.contains(userWords.get(0))
642-
|| result.contains(userWords.get(1)));
643-
644-
Assert.assertTrue(tesseractReader.getTesseract4OcrEngineProperties()
645-
.getPathToUserWordsFile().endsWith(".user-words"));
646-
}
647-
648-
@Test
649-
public void testCustomUserWordsWithListOfLanguages() {
650-
String imgPath = TEST_IMAGES_DIRECTORY + "bogusText.jpg";
651-
String expectedOutput = "B1adeb1ab1a";
652-
653-
Tesseract4OcrEngineProperties properties =
654-
tesseractReader.getTesseract4OcrEngineProperties();
655-
properties.setLanguages(Arrays.asList("fra", "eng"));
656-
properties.setUserWords("eng", Arrays.<String>asList("b1adeb1ab1a"));
657-
tesseractReader.setTesseract4OcrEngineProperties(properties);
658-
659-
String result = getRecognizedTextFromTextFile(tesseractReader, imgPath);
660-
result = result.replace("\n", "").replace("\f", "");
661-
result = result.replaceAll("[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", "");
662-
Assert.assertTrue(result.startsWith(expectedOutput));
663-
664-
Assert.assertTrue(tesseractReader.getTesseract4OcrEngineProperties()
665-
.getPathToUserWordsFile().endsWith(".user-words"));
666-
}
667-
668-
@Test
669-
public void testUserWordsWithLanguageNotInList() throws FileNotFoundException {
670-
junitExpectedException.expect(Tesseract4OcrException.class);
671-
junitExpectedException.expectMessage(MessageFormatUtil
672-
.format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST,
673-
"spa"));
674-
String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt";
675-
Tesseract4OcrEngineProperties properties =
676-
tesseractReader.getTesseract4OcrEngineProperties();
677-
properties.setUserWords("spa", new FileInputStream(userWords));
678-
properties.setLanguages(new ArrayList<String>());
679-
}
680-
681-
@Test
682-
public void testIncorrectLanguageForUserWordsAsList() {
683-
junitExpectedException.expect(Tesseract4OcrException.class);
684-
junitExpectedException.expectMessage(MessageFormatUtil
685-
.format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST,
686-
"eng1"));
687-
Tesseract4OcrEngineProperties properties =
688-
tesseractReader.getTesseract4OcrEngineProperties();
689-
properties.setUserWords("eng1", Arrays.<String>asList("word1", "word2"));
690-
properties.setLanguages(new ArrayList<String>());
691-
}
692-
693-
@Test
694-
public void testUserWordsWithDefaultLanguageNotInList()
695-
throws FileNotFoundException {
696-
String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt";
697-
Tesseract4OcrEngineProperties properties =
698-
tesseractReader.getTesseract4OcrEngineProperties();
699-
properties.setUserWords("eng", new FileInputStream(userWords));
700-
properties.setLanguages(new ArrayList<String>());
701-
tesseractReader.setTesseract4OcrEngineProperties(properties);
702-
String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
703-
String expectedOutput = "619121";
704-
String result = getRecognizedTextFromTextFile(tesseractReader, imgPath);
705-
Assert.assertTrue(result.startsWith(expectedOutput));
706-
}
707-
708-
@Test
709-
public void testUserWordsFileNotDeleted() {
710-
String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt";
711-
Tesseract4OcrEngineProperties properties =
712-
tesseractReader.getTesseract4OcrEngineProperties();
713-
properties.setPathToUserWordsFile(userWords);
714-
properties.setLanguages(Arrays.<String>asList("eng"));
715-
tesseractReader.setTesseract4OcrEngineProperties(properties);
716-
String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
717-
tesseractReader.doImageOcr(new File(imgPath));
718-
Assert.assertTrue(new File(userWords).exists());
719-
}
720-
721626
/**
722627
* Do OCR for given image and compare result text file with expected one.
723628
*/
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2020 iText Group NV
4+
Authors: iText Software.
5+
6+
This program is offered under a commercial and under the AGPL license.
7+
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
8+
9+
AGPL licensing:
10+
This program is free software: you can redistribute it and/or modify
11+
it under the terms of the GNU Affero General Public License as published by
12+
the Free Software Foundation, either version 3 of the License, or
13+
(at your option) any later version.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU Affero General Public License for more details.
19+
20+
You should have received a copy of the GNU Affero General Public License
21+
along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
package com.itextpdf.pdfocr.tesseract4;
24+
25+
import com.itextpdf.test.annotations.type.IntegrationTest;
26+
import org.junit.experimental.categories.Category;
27+
28+
@Category(IntegrationTest.class)
29+
public class UserWordsExecutableTest extends UserWordsTest {
30+
public UserWordsExecutableTest() {
31+
super(ReaderType.EXECUTABLE);
32+
}
33+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2020 iText Group NV
4+
Authors: iText Software.
5+
6+
This program is offered under a commercial and under the AGPL license.
7+
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
8+
9+
AGPL licensing:
10+
This program is free software: you can redistribute it and/or modify
11+
it under the terms of the GNU Affero General Public License as published by
12+
the Free Software Foundation, either version 3 of the License, or
13+
(at your option) any later version.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU Affero General Public License for more details.
19+
20+
You should have received a copy of the GNU Affero General Public License
21+
along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
package com.itextpdf.pdfocr.tesseract4;
24+
25+
import com.itextpdf.test.annotations.type.IntegrationTest;
26+
import org.junit.experimental.categories.Category;
27+
28+
@Category(IntegrationTest.class)
29+
public class UserWordsLibTest extends UserWordsTest {
30+
public UserWordsLibTest() {
31+
super(ReaderType.LIB);
32+
}
33+
}

0 commit comments

Comments
 (0)