feat: split reader.docs() into json() and markdown() (#179)

YoungVor · web-flow · commit 083d61a56fc1 · 2025-09-08T16:29:41.000-07:00
## What changed

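- Split the `docs()` reader function into `json()` and `markdown()`.
- Simplifies the API a little: the `data_type` argument is no longer needed.
- More extensible: easier to add new reader functions such as `reader.pdf_metadata()`.
- Breaking: the `file_path` output column is renamed to `doc_path`, and `UnsupportedFileTypeError` is removed.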
diff --git a/src/fenic/api/io/reader.py b/src/fenic/api/io/reader.py
@@ -14,7 +14,7 @@
 
 from fenic.api.functions import col
 from fenic.core._logical_plan.plans import DocSource, FileSource
-from fenic.core.error import UnsupportedFileTypeError, ValidationError
+from fenic.core.error import ValidationError
 from fenic.core.types.datatypes import JsonType, MarkdownType
 
 
@@ -263,63 +263,118 @@ def _read_file(
 
         return DataFrame._from_logical_plan(logical_node, self._session_state)
 
-    def docs(
+    def markdown(
             self,
             paths: Union[str, list[str]],
-            data_type: Union[MarkdownType, JsonType],
             exclude: Optional[str] = None,
             recursive: bool = False,
     ) -> DataFrame:
-        r"""Load a DataFrame from a list of paths of documents (markdown or json).
+        r"""Load a DataFrame from markdown files.
 
         Args:
             paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
-            data_type: Data type that will be used to cast the content of the files.
-                       One of MarkdownType or JsonType.
             exclude: A regex pattern to exclude files.
                      If it is not provided no files will be excluded.
             recursive: Whether to recursively load files from the folder.
 
         Returns:
-            DataFrame: A dataframe with all the documents found in the paths.
+            DataFrame: A dataframe with all the markdown documents found in the paths.
                        Each document is a row in the dataframe.
 
         Raises:
-            ValidationError: If any file does not have a `.md` or `.json` depending on the data_type.
-            UnsupportedFileTypeError: If the data_type is not supported.
+            ValidationError: If any file does not have a `.md` extension.
 
         Notes:
             - Each row in the dataframe corresponds to a file in the list of paths.
             - The dataframe has the following columns:
-                - file_path: The path to the file.
+                - doc_path: The path to the document.
                 - error: The error message if the file failed to be loaded.
-                - content: The content of the file casted to the data_type.
+                - content: The content of the file casted to MarkdownType.
             - Recursive loading is supported in conjunction with the '**' glob pattern,
               e.g. `data/**/*.md` will load all markdown files in the `data` folder and all subfolders
                    when recursive is set to True.
               Without recursive = True, then ** behaves like a single '*' pattern.
 
         Example: Read all the markdown files in a folder and all its subfolders.
             ```python
-            df = session.read.docs("data/docs/**/*.md", data_type=MarkdownType, recursive=True)
+            df = session.read.markdown("data/docs/**/*.md", recursive=True)
             ```
 
         Example: Read a folder of markdown files excluding some files.
             ```python
-            df = session.read.docs("data/docs/*.md", data_type=MarkdownType, exclude=r"\.bak.md$")
+            df = session.read.markdown("data/docs/*.md", exclude=r"\.bak.md$")
             ```
 
         """
-        if data_type not in [MarkdownType, JsonType]:
-            raise UnsupportedFileTypeError(f"Unsupported file type: {data_type}")
+        if isinstance(paths, str):
+            paths = [paths]
+
+        logical_node = DocSource.from_session_state(
+            paths=paths,
+            valid_file_extension="md",
+            exclude=exclude,
+            recursive=recursive,
+            session_state=self._session_state,
+        )
+        from fenic.api.dataframe import DataFrame
+
+        df = DataFrame._from_logical_plan(logical_node, self._session_state)
+        df = df.select(
+            col("file_path").alias("doc_path"),
+            col("error"),
+            col("content").cast(MarkdownType).alias("content"),
+        )
+        return df
+
+    def json(
+            self,
+            paths: Union[str, list[str]],
+            exclude: Optional[str] = None,
+            recursive: bool = False,
+    ) -> DataFrame:
+        r"""Load a DataFrame from JSON files.
 
+        Args:
+            paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
+            exclude: A regex pattern to exclude files.
+                     If it is not provided no files will be excluded.
+            recursive: Whether to recursively load files from the folder.
+
+        Returns:
+            DataFrame: A dataframe with all the JSON documents found in the paths.
+                       Each document is a row in the dataframe.
+
+        Raises:
+            ValidationError: If any file does not have a `.json` extension.
+
+        Notes:
+            - Each row in the dataframe corresponds to a file in the list of paths.
+            - The dataframe has the following columns:
+                - doc_path: The path to the document.
+                - error: The error message if the file failed to be loaded.
+                - content: The content of the file casted to JsonType.
+            - Recursive loading is supported in conjunction with the '**' glob pattern,
+              e.g. `data/**/*.json` will load all JSON files in the `data` folder and all subfolders
+                   when recursive is set to True.
+              Without recursive = True, then ** behaves like a single '*' pattern.
+
+        Example: Read all the JSON files in a folder and all its subfolders.
+            ```python
+            df = session.read.json("data/docs/**/*.json", recursive=True)
+            ```
+
+        Example: Read a folder of JSON files excluding some files.
+            ```python
+            df = session.read.json("data/docs/*.json", exclude=r"\.bak.json$")
+            ```
+
+        """
         if isinstance(paths, str):
             paths = [paths]
 
-        valid_file_extension = "md" if data_type == MarkdownType else "json"
         logical_node = DocSource.from_session_state(
             paths=paths,
-            valid_file_extension=valid_file_extension,
+            valid_file_extension="json",
             exclude=exclude,
             recursive=recursive,
             session_state=self._session_state,
@@ -328,8 +383,8 @@ def docs(
 
         df = DataFrame._from_logical_plan(logical_node, self._session_state)
         df = df.select(
-            col("file_path"),
+            col("file_path").alias("doc_path"),
             col("error"),
-            col("content").cast(data_type).alias("content"),
+            col("content").cast(JsonType).alias("content"),
         )
         return df
diff --git a/src/fenic/core/error.py b/src/fenic/core/error.py
@@ -263,14 +263,3 @@ def __init__(self, exception: Exception):
             exception: The exception that was raised.
         """
         super().__init__(f"File loader error: {exception}")
-
-class UnsupportedFileTypeError(FileLoaderError):
-    """Unsupported file type error."""
-
-    def __init__(self, file_type: DataType):
-        """Initialize a unsupported file type error.
-
-        Args:
-            file_type: The unsupported file type.
-        """
-        super().__init__(f"Unsupported file type for: {file_type}")
diff --git a/tests/_backends/local/io/test_reader.py b/tests/_backends/local/io/test_reader.py
@@ -17,6 +17,7 @@
     DoubleType,
     FloatType,
     IntegerType,
+    JsonType,
     MarkdownType,
     Schema,
     StringType,
@@ -34,7 +35,6 @@
     ConfigurationError,
     InternalError,
     PlanError,
-    UnsupportedFileTypeError,
     ValidationError,
 )
 
@@ -1195,75 +1195,87 @@ def test_view_schema_validation(local_session, temp_dir):
         local_session.view("df_csv_view")
 
 
-def test_read_docs(local_session, temp_dir_with_test_files):
-    """Test that reading from a folder works."""
-    df = local_session.read.docs(
+def test_read_markdown(local_session, temp_dir_with_test_files):
+    """Test that reading markdown files from a folder works."""
+    df = local_session.read.markdown(
         get_globbed_path(temp_dir_with_test_files, "**/*.md"),
-        data_type=MarkdownType,
         recursive=True)
     df.collect()
     assert df.schema == Schema(
         [
-            ColumnField(name="file_path", data_type=StringType),
+            ColumnField(name="doc_path", data_type=StringType),
             ColumnField(name="error", data_type=StringType),
             ColumnField(name="content", data_type=MarkdownType),
         ]
     )
     # generate the toc
     df = df.select(
-        col("file_path"),
+        col("doc_path"),
         markdown.generate_toc(col("content")).alias("toc")
     )
     dict = df.to_pydict()
-    assert len(dict["file_path"]) == 5
+    assert len(dict["doc_path"]) == 5
     assert "2 Background" in dict["toc"][0]
 
 
-def test_read_docs_invalid_path(local_session):
-    """Test that reading from an invalid path fails."""
+def test_read_markdown_invalid_path(local_session):
+    """Test that reading markdown from an invalid path fails."""
     with pytest.raises(ValidationError):
-        local_session.read.docs(
+        local_session.read.markdown(
             "/invalid/path",
-            data_type=MarkdownType,
             recursive=True)
 
 
-def test_read_docs_invalid_type(local_session, temp_dir_with_test_files):
-    """Test that reading from an invalid path fails."""
-    with pytest.raises(UnsupportedFileTypeError):
-        local_session.read.docs(
-            get_globbed_path(temp_dir_with_test_files, "**/*.md"),
-            data_type=StringType,
-            recursive=True)
+def test_read_json(local_session, temp_dir_with_test_files):
+    """Test that reading JSON files from a folder works."""
+    # Create a test JSON file
+    import json
+    json_path = Path(temp_dir_with_test_files) / "test.json"
+    with open(json_path, 'w') as f:
+        json.dump({"test": "data", "number": 42}, f)
+    
+    df = local_session.read.json(
+        get_globbed_path(temp_dir_with_test_files, "**/*.json"),
+        recursive=True)
+    df.collect()
+    assert df.schema == Schema(
+        [
+            ColumnField(name="doc_path", data_type=StringType),
+            ColumnField(name="error", data_type=StringType),
+            ColumnField(name="content", data_type=JsonType),
+        ]
+    )
+    results = df.to_pydict()
+    # There might be other JSON files in the test directory
+    assert len(results["doc_path"]) >= 1
+    # Verify our test file is in the results
+    assert any("test.json" in path for path in results["doc_path"])
 
-def test_read_docs_no_wildcard_only_valid_files(local_session, temp_dir_just_one_file):
+def test_read_markdown_no_wildcard_only_valid_files(local_session, temp_dir_just_one_file):
     """Test that reading from a path with (and no wild card) only valid files works."""
-    df = local_session.read.docs(
-        [temp_dir_just_one_file],
-        data_type=MarkdownType)
+    df = local_session.read.markdown(
+        [temp_dir_just_one_file])
     df.collect()
     dict = df.to_pydict()
-    assert len(dict["file_path"]) == 1
+    assert len(dict["doc_path"]) == 1
 
-def test_read_docs_no_files_valid_paths(local_session, temp_dir_with_test_files):
+def test_read_markdown_no_files_valid_paths(local_session, temp_dir_with_test_files):
     """Test that if no files are found, we'll get a dataframe with the path and an error message."""
-    df = local_session.read.docs(
+    df = local_session.read.markdown(
         get_globbed_path(temp_dir_with_test_files, "**/*.unknown_extension"),
-        data_type=MarkdownType,
         recursive=True)
     df.collect()
     results = df.to_pydict()
-    assert len(results["file_path"]) == 0
+    assert len(results["doc_path"]) == 0
 
-def test_read_docs_no_wildcard_path_is_file(local_session, temp_dir_just_one_file):
+def test_read_markdown_no_wildcard_path_is_file(local_session, temp_dir_just_one_file):
     """Test that reading from a path to a file works."""
-    df = local_session.read.docs(
-        [str(Path.joinpath(Path(temp_dir_just_one_file), "file1.md"))],
-        data_type=MarkdownType)
+    df = local_session.read.markdown(
+        [str(Path.joinpath(Path(temp_dir_just_one_file), "file1.md"))])
     df.collect()
     dict = df.to_pydict()
-    assert len(dict["file_path"]) == 1
-    assert "file1.md" in dict["file_path"][0]
+    assert len(dict["doc_path"]) == 1
+    assert "file1.md" in dict["doc_path"][0]
 
 def get_globbed_path(path: str, file_extension: str) -> list[str]:
     return [str(Path.joinpath(Path(path), file_extension))]
diff --git a/tests/_logical_plan/serde/test_plan_serde.py b/tests/_logical_plan/serde/test_plan_serde.py
@@ -52,7 +52,6 @@
 from fenic.core._serde.proto.serde_context import SerdeContext
 from fenic.core._serde.serde_protocol import SupportsLogicalPlanSerde
 from fenic.core.types import ClassDefinition
-from fenic.core.types.datatypes import MarkdownType
 from fenic.core.types.semantic_examples import MapExample, MapExampleCollection
 
 
@@ -80,7 +79,7 @@ def _create_plan_examples(session, temp_dir_with_test_files):
         DocSource: [
             ("doc_source", DocSource.from_session_state(
                 paths=[temp_dir_with_test_files],
-                valid_file_extension=".md",
+                valid_file_extension="md",
                 recursive=True,
                 session_state=session._session_state,
             )),
@@ -447,9 +446,8 @@ def test_table_source_plans(local_session, serde_implementation: SupportsLogical
 
 @pytest.mark.parametrize("serde_implementation", serde_implementations)
 def test_doc_source_plans(local_session, serde_implementation: SupportsLogicalPlanSerde, temp_dir_with_test_files):
-    df_docs = local_session.read.docs(
+    df_docs = local_session.read.markdown(
         [temp_dir_with_test_files],
-        data_type=MarkdownType,
         recursive=True
     )
     plan = df_docs._logical_plan