Skip to content

Commit e537e4e

Browse files
authored
Merge pull request #461 from JohnSnowLabs/ocr-class-python
OCR changes and bullets
2 parents 4d829f6 + 8466031 commit e537e4e

File tree

4 files changed

+64
-178
lines changed

4 files changed

+64
-178
lines changed

CHANGELOG

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,17 @@ Bugfixes
6262
* Fixed a bug where DateMatcher didn't know how to handle dashes in dates where the year had two digits instead of four
6363
* Fixed a ContextSpellChecker bug that prevented it from being used repeatedly with collections in LightPipeline
6464
* Fixed a bug in OCR that made it blow up with some image formats when using text preferred method
65+
* Fixed a bug in OCR that prevented params from working in cluster mode
66+
* Fixed OCR setSplitPages and setSplitRegions to work properly when Tesseract detects multiple regions
6567

6668
----------------
6769
Developer API
6870
----------------
6971
* AnnotatorType params renamed to inputAnnotatorTypes and outputAnnotatorTypes
7072
* Embeddings now serialize along a FloatArray in Annotation class
7173
* Disabled useFeatureBroadcasting; this showed better performance numbers when training large models in annotators that use Features
74+
* OcrHelper must now be instantiated before use (e.g. `ocr = OcrHelper()`); its methods are no longer static
75+
* OCR is now tested with, and mainly compatible with, Tesseract 4.0.0
7276

7377
----------------
7478
Build and release

python/sparknlp/internal.py

Lines changed: 0 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -84,119 +84,6 @@ def __init__(self, pipelineModel):
8484
self._java_obj = self._new_java_obj(self._java_obj, pipelineModel._to_java())
8585

8686

87-
# ============
88-
# OCR SECTION
89-
# ============
90-
91-
92-
class _OcrCreateDataset(ExtendedJavaWrapper):
93-
def __init__(self, spark, input_path):
94-
super(_OcrCreateDataset, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.createDataset")
95-
self._java_obj = self._new_java_obj(self._java_obj, spark, input_path)
96-
97-
98-
class _OcrCreateMap(ExtendedJavaWrapper):
99-
def __init__(self, input_path):
100-
super(_OcrCreateMap, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.createMap")
101-
self._java_obj = self._new_java_obj(self._java_obj, input_path)
102-
103-
104-
class _OcrSetPreferredMethod(ExtendedJavaWrapper):
105-
def __init__(self, value):
106-
super(_OcrSetPreferredMethod, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setPreferredMethod")
107-
self._java_obj = self._new_java_obj(self._java_obj, value)
108-
109-
110-
class _OcrGetPreferredMethod(ExtendedJavaWrapper):
111-
def __init__(self):
112-
super(_OcrGetPreferredMethod, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getPreferredMethod")
113-
self._java_obj = self._new_java_obj(self._java_obj)
114-
115-
116-
class _OcrSetFallbackMethod(ExtendedJavaWrapper):
117-
def __init__(self, value):
118-
super(_OcrSetFallbackMethod, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setFallbackMethod")
119-
self._java_obj = self._new_java_obj(self._java_obj, value)
120-
121-
122-
class _OcrGetFallbackMethod(ExtendedJavaWrapper):
123-
def __init__(self):
124-
super(_OcrGetFallbackMethod, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getFallbackMethod")
125-
self._java_obj = self._new_java_obj(self._java_obj)
126-
127-
128-
class _OcrSetMinSizeBeforeFallback(ExtendedJavaWrapper):
129-
def __init__(self, value):
130-
super(_OcrSetMinSizeBeforeFallback, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setMinSizeBeforeFallback")
131-
self._java_obj = self._new_java_obj(self._java_obj, value)
132-
133-
134-
class _OcrGetMinSizeBeforeFallback(ExtendedJavaWrapper):
135-
def __init__(self):
136-
super(_OcrGetMinSizeBeforeFallback, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getMinSizeBeforeFallback")
137-
self._java_obj = self._new_java_obj(self._java_obj)
138-
139-
140-
class _OcrSetPageSegMode(ExtendedJavaWrapper):
141-
def __init__(self, value):
142-
super(_OcrSetPageSegMode, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setPageSegMode")
143-
self._java_obj = self._new_java_obj(self._java_obj, value)
144-
145-
146-
class _OcrGetPageSegMode(ExtendedJavaWrapper):
147-
def __init__(self):
148-
super(_OcrGetPageSegMode, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getPageSegMode")
149-
self._java_obj = self._new_java_obj(self._java_obj)
150-
151-
152-
class _OcrSetEngineMode(ExtendedJavaWrapper):
153-
def __init__(self, value):
154-
super(_OcrSetEngineMode, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setEngineMode")
155-
self._java_obj = self._new_java_obj(self._java_obj, value)
156-
157-
158-
class _OcrGetEngineMode(ExtendedJavaWrapper):
159-
def __init__(self):
160-
super(_OcrGetEngineMode, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getEngineMode")
161-
self._java_obj = self._new_java_obj(self._java_obj)
162-
163-
164-
class _OcrSetPageIteratorLevel(ExtendedJavaWrapper):
165-
def __init__(self, value):
166-
super(_OcrSetPageIteratorLevel, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setPageIteratorLevel")
167-
self._java_obj = self._new_java_obj(self._java_obj, value)
168-
169-
170-
class _OcrGetPageIteratorLevel(ExtendedJavaWrapper):
171-
def __init__(self):
172-
super(_OcrGetPageIteratorLevel, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getPageIteratorLevel")
173-
self._java_obj = self._new_java_obj(self._java_obj)
174-
175-
176-
class _OcrSetScalingFactor(ExtendedJavaWrapper):
177-
def __init__(self, value):
178-
super(_OcrSetScalingFactor, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setScalingFactor")
179-
self._java_obj = self._new_java_obj(self._java_obj, value)
180-
181-
182-
class _OcrGetSplitPages(ExtendedJavaWrapper):
183-
def __init__(self):
184-
super(_OcrGetSplitPages, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.getSplitPages")
185-
self._java_obj = self._new_java_obj(self._java_obj)
186-
187-
188-
class _OcrSetSplitPages(ExtendedJavaWrapper):
189-
def __init__(self, value):
190-
super(_OcrSetSplitPages, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.setSplitPages")
191-
self._java_obj = self._new_java_obj(self._java_obj, value)
192-
193-
194-
class _OcrUseErosion(ExtendedJavaWrapper):
195-
def __init__(self, use, k_size, k_shape):
196-
super(_OcrUseErosion, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper.useErosion")
197-
self._java_obj = self._new_java_obj(self._java_obj, use, k_size, k_shape)
198-
199-
20087
# ==================
20188
# Utils
20289
# ==================

python/sparknlp/ocr.py

Lines changed: 49 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,71 @@
1-
import sparknlp.internal as _int
1+
from sparknlp.internal import ExtendedJavaWrapper
22
from pyspark.sql import SparkSession, DataFrame
33

44

5-
class OcrHelper:
6-
@staticmethod
7-
def createDataset(spark, input_path):
5+
class OcrHelper(ExtendedJavaWrapper):
6+
7+
def __init__(self):
8+
super(OcrHelper, self).__init__("com.johnsnowlabs.nlp.util.io.OcrHelper")
9+
self._java_obj = self._new_java_obj(self._java_obj)
10+
11+
def createDataset(self, spark, input_path):
812
if type(spark) != SparkSession:
913
raise Exception("spark must be SparkSession")
10-
return DataFrame(_int._OcrCreateDataset(spark._jsparkSession, input_path).apply(), spark)
14+
return DataFrame(self._java_obj.createDataset(spark._jsparkSession, input_path), spark)
15+
16+
def createMap(self, input_path):
17+
return self._java_obj.createMap(input_path)
18+
19+
def setPreferredMethod(self, value):
20+
return self._java_obj.setPreferredMethod(value)
1121

12-
@staticmethod
13-
def createMap(input_path):
14-
return _int._OcrCreateMap(input_path).apply()
22+
def getPreferredMethod(self):
23+
return self._java_obj.getPreferredMethod()
1524

16-
@staticmethod
17-
def setPreferredMethod(value):
18-
return _int._OcrSetPreferredMethod(value).apply()
25+
def setFallbackMethod(self, value):
26+
return self._java_obj.setFallbackMethod(value)
1927

20-
@staticmethod
21-
def getPreferredMethod():
22-
return _int._OcrGetPreferredMethod().apply()
28+
def getFallbackMethod(self):
29+
return self._java_obj.getFallbackMethod()
2330

24-
@staticmethod
25-
def setFallbackMethod(value):
26-
return _int._OcrSetFallbackMethod(value).apply()
31+
def setMinSizeBeforeFallback(self, value):
32+
return self._java_obj.setMinSizeBeforeFallback(value)
2733

28-
@staticmethod
29-
def getFallbackMethod():
30-
return _int._OcrGetFallbackMethod().apply()
34+
def getMinSizeBeforeFallback(self):
35+
return self._java_obj.getMinSizeBeforeFallback()
3136

32-
@staticmethod
33-
def setMinSizeBeforeFallback(value):
34-
return _int._OcrSetMinSizeBeforeFallback(value).apply()
37+
def setEngineMode(self, mode):
38+
return self._java_obj.setEngineMode(mode)
3539

36-
@staticmethod
37-
def getMinSizeBeforeFallback():
38-
return _int._OcrGetMinSizeBeforeFallback().apply()
40+
def getEngineMode(self):
41+
return self._java_obj.getEngineMode()
3942

40-
@staticmethod
41-
def setEngineMode(mode):
42-
return _int._OcrSetEngineMode(mode).apply()
43+
def setPageSegMode(self, mode):
44+
return self._java_obj.setPageSegMode(mode)
4345

44-
@staticmethod
45-
def getEngineMode():
46-
return _int._OcrGetEngineMode().apply()
46+
def getPageSegMode(self):
47+
return self._java_obj.getPageSegMode()
4748

48-
@staticmethod
49-
def setPageSegMode(mode):
50-
return _int._OcrSetPageSegMode(mode).apply()
49+
def setPageIteratorLevel(self, level):
50+
return self._java_obj.setPageIteratorLevel(level)
5151

52-
@staticmethod
53-
def getPageSegMode():
54-
return _int._OcrGetPageSegMode().apply()
52+
def getPageIteratorLevel(self):
53+
return self._java_obj.getPageIteratorLevel()
5554

56-
@staticmethod
57-
def setPageIteratorLevel(level):
58-
return _int._OcrSetPageIteratorLevel(level).apply()
55+
def setScalingFactor(self, factor):
56+
return self._java_obj.setScalingFactor(factor)
5957

60-
@staticmethod
61-
def getPageIteratorLevel():
62-
return _int._OcrGetPageIteratorLevel().apply()
58+
def setSplitPages(self, value):
59+
return self._java_obj.setSplitPages(value)
6360

64-
@staticmethod
65-
def setScalingFactor(factor):
66-
return _int._OcrSetScalingFactor(factor).apply()
61+
def getSplitPages(self):
62+
return self._java_obj.getSplitPages()
6763

68-
@staticmethod
69-
def setSplitPages(value):
70-
return _int._OcrSetSplitPages(value).apply()
64+
def setSplitRegions(self, value):
65+
return self._java_obj.setSplitRegions(value)
7166

72-
@staticmethod
73-
def getSplitPages():
74-
return _int._OcrGetSplitPages().apply()
67+
def getSplitRegions(self):
68+
return self._java_obj.getSplitRegions()
7569

76-
@staticmethod
77-
def useErosion(use, k_size=2, k_shape=0):
78-
return _int._OcrUseErosion(use, k_size, k_shape).apply()
70+
def useErosion(self, use, k_size=2, k_shape=0):
71+
return self._java_obj.useErosion(use, k_size, k_shape)

python/test/annotators.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -543,22 +543,24 @@ def runTest():
543543
class OcrTestSpec(unittest.TestCase):
544544
@staticmethod
545545
def runTest():
546-
OcrHelper.setPreferredMethod('text')
547-
print("text layer is: " + str(OcrHelper.getPreferredMethod()))
546+
ocr = OcrHelper()
547+
ocr.setPreferredMethod('text')
548+
print("text layer is: " + str(ocr.getPreferredMethod()))
548549
pdf_path = "file:///" + os.getcwd() + "/../ocr/src/test/resources/pdfs/"
549-
data = OcrHelper.createDataset(
550+
data = ocr.createDataset(
550551
spark=SparkContextForTest.spark,
551552
input_path=pdf_path)
552553
data.show()
553-
OcrHelper.setPreferredMethod('image')
554-
print("Text layer disabled")
555-
data = OcrHelper.createDataset(
554+
ocr.setPreferredMethod('image')
555+
print("Text layer disabled. set to: ", ocr.getPreferredMethod())
556+
data = ocr.createDataset(
556557
spark=SparkContextForTest.spark,
557558
input_path=pdf_path)
558559
data.show()
559-
OcrHelper.setPreferredMethod('text')
560-
content = OcrHelper.createMap(input_path="../ocr/src/test/resources/pdfs")
561-
print(content)
560+
ocr.setPreferredMethod('text')
561+
print("Text layer enabled. set to: ", ocr.getPreferredMethod())
562+
content = ocr.createMap(input_path="../ocr/src/test/resources/pdfs")
563+
print("ocr create map: ", content)
562564
document_assembler = DocumentAssembler() \
563565
.setInputCol("text") \
564566
.setOutputCol("document")

0 commit comments

Comments
 (0)