
Commit 21e651d

[SPARKNLP-1291] Adding support for input string column on readers
1 parent d4e84d5 commit 21e651d

10 files changed: +335 −37 lines


python/sparknlp/partition/partition_properties.py

Lines changed: 27 additions & 10 deletions
@@ -18,13 +18,40 @@
 
 class HasReaderProperties(Params):
 
+    inputCol = Param(
+        Params._dummy(),
+        "inputCol",
+        "input column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setInputCol(self, value):
+        """Sets input column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Input Column
+        """
+        return self._set(inputCol=value)
+
     outputCol = Param(
         Params._dummy(),
         "outputCol",
         "output column name",
         typeConverter=TypeConverters.toString
     )
 
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
+
     contentPath = Param(
         Params._dummy(),
         "contentPath",
@@ -683,13 +710,3 @@ def setReadAsImage(self, value: bool):
         True to read as images, False otherwise.
         """
         return self._set(readAsImage=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
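With inputCol now exposed on HasReaderProperties, readers can take raw content from an existing DataFrame column instead of a contentPath. A minimal sketch of the Python API, assuming a session started via sparknlp.start() and that Reader2Doc is importable from sparknlp.reader.reader2doc (import paths may vary by release):

import sparknlp
from sparknlp.reader.reader2doc import Reader2Doc
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Raw HTML lives in a DataFrame column rather than on disk.
html_df = spark.createDataFrame(
    [(1, "<html><body><p>Hello from a string column</p></body></html>")],
    ["id", "html"])

reader2doc = Reader2Doc() \
    .setInputCol("html") \
    .setOutputCol("document")

model = Pipeline(stages=[reader2doc]).fit(html_df)
result_df = model.transform(html_df)
result_df.select("document").show(truncate=False)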

python/test/reader/reader2doc_test.py

Lines changed: 20 additions & 0 deletions
@@ -111,4 +111,24 @@ def runTest(self):
 
         result_df = model.transform(self.empty_df)
 
+        self.assertTrue(result_df.select("document").count() > 0)
+
+@pytest.mark.fast
+class Reader2DocTestInputColumn(unittest.TestCase):
+
+    def setUp(self):
+        spark = SparkContextForTest.spark
+        content = "<html><head><title>Test<title><body><p>Unclosed tag"
+        self.html_df = spark.createDataFrame([(1, content)], ["id", "html"])
+
+    def runTest(self):
+        reader2doc = Reader2Doc() \
+            .setInputCol("html") \
+            .setOutputCol("document")
+
+        pipeline = Pipeline(stages=[reader2doc])
+        model = pipeline.fit(self.html_df)
+
+        result_df = model.transform(self.html_df)
+
         self.assertTrue(result_df.select("document").count() > 0)
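The pipeline in this test produces a column of Spark NLP document annotations. A hypothetical follow-up, exploding result_df from the sketch above for inspection (field names follow the standard annotation schema):

from pyspark.sql import functions as F

# Each annotation is a struct with annotatorType, begin, end, result, metadata.
result_df.select(F.explode("document").alias("doc")) \
    .select("doc.result", "doc.metadata") \
    .show(truncate=False)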

python/test/reader/reader2table_test.py

Lines changed: 32 additions & 2 deletions
@@ -40,7 +40,6 @@ def runTest(self):
         model = pipeline.fit(self.empty_df)
 
         result_df = model.transform(self.empty_df)
-        result_df.show(truncate=False)
 
         self.assertTrue(result_df.select("document").count() > 0)
 
@@ -60,4 +59,35 @@ def runTest(self):
 
         result_df = model.transform(self.empty_df)
 
-        self.assertTrue(result_df.select("document").count() > 1)
+        self.assertTrue(result_df.select("document").count() > 1)
+
+@pytest.mark.fast
+class Reader2TableInputColTest(unittest.TestCase):
+
+    def setUp(self):
+        content = """
+        <html>
+            <body>
+                <table>
+                    <tr>
+                        <td>Hello World</td>
+                    </tr>
+                </table>
+            </body>
+        </html>
+        """
+        spark = SparkContextForTest.spark
+        self.html_df = spark.createDataFrame([(1, content)], ["id", "html"])
+
+    def runTest(self):
+        reader2table = Reader2Table() \
+            .setInputCol("html") \
+            .setContentType("text/html") \
+            .setOutputCol("document")
+
+        pipeline = Pipeline(stages=[reader2table])
+        model = pipeline.fit(self.html_df)
+
+        result_df = model.transform(self.html_df)
+
+        self.assertTrue(result_df.select("document").count() > 0)

src/main/scala/com/johnsnowlabs/partition/HasReaderProperties.scala

Lines changed: 9 additions & 1 deletion
@@ -19,6 +19,13 @@ import org.apache.spark.ml.param.{BooleanParam, Param}
 
 trait HasReaderProperties extends HasHTMLReaderProperties {
 
+  protected final val inputCol: Param[String] =
+    new Param(this, "inputCol", "input column to process")
+
+  final def setInputCol(value: String): this.type = set(inputCol, value)
+
+  final def getInputCol: String = $(inputCol)
+
   val contentPath = new Param[String](this, "contentPath", "Path to the content source")
 
   def setContentPath(value: String): this.type = set(contentPath, value)
@@ -75,6 +82,7 @@ trait HasReaderProperties extends HasHTMLReaderProperties {
     titleFontSize -> 9,
     inferTableStructure -> false,
     includePageBreaks -> false,
-    ignoreExceptions -> true)
+    ignoreExceptions -> true,
+    inputCol -> "")
 
 }

src/main/scala/com/johnsnowlabs/reader/HasReaderContent.scala

Lines changed: 37 additions & 2 deletions
@@ -104,12 +104,11 @@ trait HasReaderContent extends HasReaderProperties {
     }
   }
 
-  def partitionContent(
+  private def partitionContentFromPath(
       partition: Partition,
       contentPath: String,
       isText: Boolean,
       dataset: Dataset[_]): DataFrame = {
-
     val ext = contentPath.split("\\.").lastOption.getOrElse("").toLowerCase
     if (! $(ignoreExceptions) && !supportedTypes.contains(ext)) {
       return buildErrorDataFrame(dataset, contentPath, ext)
@@ -148,6 +147,38 @@ trait HasReaderContent extends HasReaderProperties {
     } else partitionDf
   }
 
+  def partitionContent(
+      partition: Partition,
+      contentPath: String,
+      isText: Boolean,
+      dataset: Dataset[_]): DataFrame = {
+
+    val partitionDf =
+      if (getInputCol != null && getInputCol.nonEmpty) {
+        partitionContentFromDataFrame(partition, dataset, getInputCol)
+      } else {
+        partitionContentFromPath(partition, contentPath, isText, dataset)
+      }
+
+    if ($(ignoreExceptions)) {
+      partitionDf.filter(col("exception").isNull)
+    } else partitionDf
+  }
+
+  /** Partition content when it is already present in a dataset column. */
+  private def partitionContentFromDataFrame(
+      partition: Partition,
+      dataset: Dataset[_],
+      inputCol: String): DataFrame = {
+    val partitionUDF =
+      udf((text: String) => partition.partitionStringContent(text, $(this.headers).asJava))
+
+    dataset
+      .withColumn(partition.getOutputColumn, partitionUDF(col(inputCol)))
+      .withColumn("fileName", lit(null: String))
+      .withColumn("exception", lit(null: String))
+  }
+
   val getFileName: UserDefinedFunction = udf { path: String =>
     if (path != null) path.split("/").last else ""
   }
@@ -166,4 +197,8 @@ trait HasReaderContent extends HasReaderProperties {
     dataset.sparkSession.createDataFrame(emptyRDD, schema)
   }
 
+  def getContentType: String = {
+    if ($(contentType).trim.isEmpty && getInputCol.nonEmpty) "text/plain" else $(contentType)
+  }
+
 }
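partitionContent now dispatches on inputCol: when the column name is non-empty, each row of that column is partitioned through a UDF (with fileName and exception set to null); otherwise the original path-based reading runs. In both branches, rows with a non-null exception are dropped when ignoreExceptions is set. A sketch of the two modes from the Python side, with /tmp/html-docs as an assumed example path:

# Path mode: inputCol keeps its default (""), so content is read from disk.
path_reader = Reader2Doc() \
    .setContentType("text/html") \
    .setContentPath("/tmp/html-docs") \
    .setOutputCol("document")

# Column mode: inputCol is set, so contentPath is not required and every
# row of the "html" column is partitioned in place; fileName comes back null.
column_reader = Reader2Doc() \
    .setInputCol("html") \
    .setOutputCol("document")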

src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala

Lines changed: 10 additions & 9 deletions
@@ -127,13 +127,13 @@
 
   override def transform(dataset: Dataset[_]): DataFrame = {
     validateRequiredParameters()
-    val structuredDf = if ($(contentType).trim.isEmpty) {
+    val structuredDf = if ($(contentType).trim.isEmpty && getInputCol.trim.isEmpty) {
       val partitionParams = Map(
         "inferTableStructure" -> $(inferTableStructure).toString,
         "outputFormat" -> $(outputFormat))
       partitionMixedContent(dataset, $(contentPath), partitionParams)
     } else {
-      partitionContent(partitionBuilder, $(contentPath), isStringContent($(contentType)), dataset)
+      partitionContent(partitionBuilder, $(contentPath), isStringContent(getContentType), dataset)
     }
     if (!structuredDf.isEmpty) {
       val annotatedDf = structuredDf
@@ -149,7 +149,7 @@ class Reader2Doc(override val uid: String)
 
   protected def partitionBuilder: Partition = {
     val params = Map(
-      "contentType" -> $(contentType),
+      "contentType" -> getContentType,
       "storeContent" -> $(storeContent).toString,
       "titleFontSize" -> $(titleFontSize).toString,
       "inferTableStructure" -> $(inferTableStructure).toString,
@@ -186,15 +186,16 @@ class Reader2Doc(override val uid: String)
   }
 
   protected def validateRequiredParameters(): Unit = {
-    require(
-      $(contentPath) != null && $(contentPath).trim.nonEmpty,
-      "contentPath must be set and not empty")
+    val hasContentPath = $(contentPath) != null && $(contentPath).trim.nonEmpty
+    if (hasContentPath) {
+      require(
+        ResourceHelper.validFile($(contentPath)),
+        "contentPath must point to a valid file or directory")
+    }
+
     require(
       $(outputFormat) == "plain-text",
       "Only 'plain-text' outputFormat is supported for this operation.")
-    require(
-      ResourceHelper.validFile($(contentPath)),
-      "contentPath must point to a valid file or directory")
   }
 
   protected def partitionToAnnotation: UserDefinedFunction = udf {
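Two consequences of this change: validateRequiredParameters no longer insists on a contentPath (it only validates one that is actually set), and getContentType falls back to text/plain when reading from a column with no explicit contentType. A sketch under those assumptions, reusing spark and Pipeline from the earlier example:

# Plain text in a column: no setContentType call is needed, because
# getContentType defaults to "text/plain" whenever inputCol is set.
text_df = spark.createDataFrame([(1, "Just a plain sentence.")], ["id", "text"])

plain_reader = Reader2Doc() \
    .setInputCol("text") \
    .setOutputCol("document")

Pipeline(stages=[plain_reader]).fit(text_df).transform(text_df).show()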

src/main/scala/com/johnsnowlabs/reader/Reader2Table.scala

Lines changed: 8 additions & 4 deletions
@@ -16,6 +16,7 @@
 package com.johnsnowlabs.reader
 
 import com.johnsnowlabs.nlp.Annotation
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.expressions.UserDefinedFunction
 import org.apache.spark.sql.functions.udf
@@ -84,7 +85,7 @@ class Reader2Table(override val uid: String) extends Reader2Doc {
   }
 
   private def getAcceptedTypes(fileName: String): Set[String] = {
-    if (fileName.isEmpty) {
+    if (fileName == null || fileName.isEmpty) {
       val officeDocTypes = Set(
         "application/msword",
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -171,9 +172,12 @@ class Reader2Table(override val uid: String) extends Reader2Doc {
   }
 
   override def validateRequiredParameters(): Unit = {
-    require(
-      $(contentPath) != null && $(contentPath).trim.nonEmpty,
-      "contentPath must be set and not empty")
+    val hasContentPath = $(contentPath) != null && $(contentPath).trim.nonEmpty
+    if (hasContentPath) {
+      require(
+        ResourceHelper.validFile($(contentPath)),
+        "contentPath must point to a valid file or directory")
+    }
     require(
       Set("html-table", "json-table").contains($(outputFormat)),
       "outputFormat must be either 'html-table' or 'json-table'.")

src/main/scala/com/johnsnowlabs/reader/util/HTMLParser.scala

Lines changed: 9 additions & 9 deletions
@@ -87,23 +87,23 @@ object HTMLParser {
   def tableElementToJson(tableElem: Element): String = {
     implicit val formats = Serialization.formats(NoTypeHints)
 
-    val caption = Option(tableElem.selectFirst("caption")).map(_.text.trim).getOrElse("")
+    val caption = Option(tableElem.selectFirst("caption"))
+      .map(_.text.trim)
+      .getOrElse("")
 
-    // Headers: first row with th or td as header
-    val headerRowOpt = tableElem
-      .select("tr")
-      .asScala
-      .find(tr => tr.select("th,td").asScala.nonEmpty && tr.select("th").asScala.nonEmpty)
+    val allRows = tableElem.select("tr").asScala.toList
+
+    val headerRowOpt = allRows.find(tr => tr.select("th").asScala.nonEmpty)
 
     val headers: List[String] = headerRowOpt
       .map(_.select("th,td").asScala.map(_.text.trim).toList)
       .getOrElse(List.empty)
 
-    val allRows = tableElem.select("tr").asScala.toList
-    val headerIndex = headerRowOpt.map(allRows.indexOf).getOrElse(0)
+    val headerIndexOpt = headerRowOpt.map(allRows.indexOf)
+
     val dataRows =
       allRows.zipWithIndex
-        .filter { case (_, idx) => idx != headerIndex } // skip header row
+        .filter { case (_, idx) => !headerIndexOpt.contains(idx) }
         .map(_._1)
         .map(row => row.select("td").asScala.map(_.text.trim).toList)
         .filter(_.nonEmpty)
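This fixes header detection for headerless tables: previously headerIndex defaulted to 0 even when no row contained a th, so the first data row was silently skipped; now the header index is optional and such tables keep every row. A pure-Python mirror of the fixed selection logic (the tuple model and function name are illustrative, not the Scala API):

def split_header_and_data(rows):
    # rows: list of (has_th, cells) tuples standing in for parsed <tr> elements.
    header_idx = next((i for i, (has_th, _) in enumerate(rows) if has_th), None)
    headers = rows[header_idx][1] if header_idx is not None else []
    # "i != header_idx" is True for every index when header_idx is None,
    # matching !headerIndexOpt.contains(idx) in the Scala version.
    data = [cells for i, (_, cells) in enumerate(rows) if i != header_idx]
    return headers, data

# Headerless table: both rows survive as data instead of losing row 0.
print(split_header_and_data([(False, ["a", "b"]), (False, ["c", "d"])]))
# -> ([], [['a', 'b'], ['c', 'd']])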
