Skip to content

Commit 083d61a

Browse files
authored
feat: split reader.docs() into json() and markdown() (#179)
## What changed
- Split the `docs()` reader function into `json()` and `markdown()`.
- Simplifies the API a little.
- More extensible: easier to add new functions like `reader.pdf_metadata()`.
1 parent bd778e5 commit 083d61a

File tree

4 files changed

+122
-68
lines changed

4 files changed

+122
-68
lines changed

src/fenic/api/io/reader.py

Lines changed: 74 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from fenic.api.functions import col
1616
from fenic.core._logical_plan.plans import DocSource, FileSource
17-
from fenic.core.error import UnsupportedFileTypeError, ValidationError
17+
from fenic.core.error import ValidationError
1818
from fenic.core.types.datatypes import JsonType, MarkdownType
1919

2020

@@ -263,63 +263,118 @@ def _read_file(
263263

264264
return DataFrame._from_logical_plan(logical_node, self._session_state)
265265

266-
def docs(
266+
def markdown(
267267
self,
268268
paths: Union[str, list[str]],
269-
data_type: Union[MarkdownType, JsonType],
270269
exclude: Optional[str] = None,
271270
recursive: bool = False,
272271
) -> DataFrame:
273-
r"""Load a DataFrame from a list of paths of documents (markdown or json).
272+
r"""Load a DataFrame from markdown files.
274273
275274
Args:
276275
paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
277-
data_type: Data type that will be used to cast the content of the files.
278-
One of MarkdownType or JsonType.
279276
exclude: A regex pattern to exclude files.
280277
If it is not provided no files will be excluded.
281278
recursive: Whether to recursively load files from the folder.
282279
283280
Returns:
284-
DataFrame: A dataframe with all the documents found in the paths.
281+
DataFrame: A dataframe with all the markdown documents found in the paths.
285282
Each document is a row in the dataframe.
286283
287284
Raises:
288-
ValidationError: If any file does not have a `.md` or `.json` depending on the data_type.
289-
UnsupportedFileTypeError: If the data_type is not supported.
285+
ValidationError: If any file does not have a `.md` extension.
290286
291287
Notes:
292288
- Each row in the dataframe corresponds to a file in the list of paths.
293289
- The dataframe has the following columns:
294-
- file_path: The path to the file.
290+
- doc_path: The path to the document.
295291
- error: The error message if the file failed to be loaded.
296-
- content: The content of the file casted to the data_type.
292+
- content: The content of the file casted to MarkdownType.
297293
- Recursive loading is supported in conjunction with the '**' glob pattern,
298294
e.g. `data/**/*.md` will load all markdown files in the `data` folder and all subfolders
299295
when recursive is set to True.
300296
Without recursive = True, then ** behaves like a single '*' pattern.
301297
302298
Example: Read all the markdown files in a folder and all its subfolders.
303299
```python
304-
df = session.read.docs("data/docs/**/*.md", data_type=MarkdownType, recursive=True)
300+
df = session.read.markdown("data/docs/**/*.md", recursive=True)
305301
```
306302
307303
Example: Read a folder of markdown files excluding some files.
308304
```python
309-
df = session.read.docs("data/docs/*.md", data_type=MarkdownType, exclude=r"\.bak.md$")
305+
df = session.read.markdown("data/docs/*.md", exclude=r"\.bak.md$")
310306
```
311307
312308
"""
313-
if data_type not in [MarkdownType, JsonType]:
314-
raise UnsupportedFileTypeError(f"Unsupported file type: {data_type}")
309+
if isinstance(paths, str):
310+
paths = [paths]
311+
312+
logical_node = DocSource.from_session_state(
313+
paths=paths,
314+
valid_file_extension="md",
315+
exclude=exclude,
316+
recursive=recursive,
317+
session_state=self._session_state,
318+
)
319+
from fenic.api.dataframe import DataFrame
320+
321+
df = DataFrame._from_logical_plan(logical_node, self._session_state)
322+
df = df.select(
323+
col("file_path").alias("doc_path"),
324+
col("error"),
325+
col("content").cast(MarkdownType).alias("content"),
326+
)
327+
return df
328+
329+
def json(
330+
self,
331+
paths: Union[str, list[str]],
332+
exclude: Optional[str] = None,
333+
recursive: bool = False,
334+
) -> DataFrame:
335+
r"""Load a DataFrame from JSON files.
315336
337+
Args:
338+
paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
339+
exclude: A regex pattern to exclude files.
340+
If it is not provided no files will be excluded.
341+
recursive: Whether to recursively load files from the folder.
342+
343+
Returns:
344+
DataFrame: A dataframe with all the JSON documents found in the paths.
345+
Each document is a row in the dataframe.
346+
347+
Raises:
348+
ValidationError: If any file does not have a `.json` extension.
349+
350+
Notes:
351+
- Each row in the dataframe corresponds to a file in the list of paths.
352+
- The dataframe has the following columns:
353+
- doc_path: The path to the document.
354+
- error: The error message if the file failed to be loaded.
355+
- content: The content of the file casted to JsonType.
356+
- Recursive loading is supported in conjunction with the '**' glob pattern,
357+
e.g. `data/**/*.json` will load all JSON files in the `data` folder and all subfolders
358+
when recursive is set to True.
359+
Without recursive = True, then ** behaves like a single '*' pattern.
360+
361+
Example: Read all the JSON files in a folder and all its subfolders.
362+
```python
363+
df = session.read.json("data/docs/**/*.json", recursive=True)
364+
```
365+
366+
Example: Read a folder of JSON files excluding some files.
367+
```python
368+
df = session.read.json("data/docs/*.json", exclude=r"\.bak.json$")
369+
```
370+
371+
"""
316372
if isinstance(paths, str):
317373
paths = [paths]
318374

319-
valid_file_extension = "md" if data_type == MarkdownType else "json"
320375
logical_node = DocSource.from_session_state(
321376
paths=paths,
322-
valid_file_extension=valid_file_extension,
377+
valid_file_extension="json",
323378
exclude=exclude,
324379
recursive=recursive,
325380
session_state=self._session_state,
@@ -328,8 +383,8 @@ def docs(
328383

329384
df = DataFrame._from_logical_plan(logical_node, self._session_state)
330385
df = df.select(
331-
col("file_path"),
386+
col("file_path").alias("doc_path"),
332387
col("error"),
333-
col("content").cast(data_type).alias("content"),
388+
col("content").cast(JsonType).alias("content"),
334389
)
335390
return df

src/fenic/core/error.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -263,14 +263,3 @@ def __init__(self, exception: Exception):
263263
exception: The exception that was raised.
264264
"""
265265
super().__init__(f"File loader error: {exception}")
266-
267-
class UnsupportedFileTypeError(FileLoaderError):
268-
"""Unsupported file type error."""
269-
270-
def __init__(self, file_type: DataType):
271-
"""Initialize a unsupported file type error.
272-
273-
Args:
274-
file_type: The unsupported file type.
275-
"""
276-
super().__init__(f"Unsupported file type for: {file_type}")

tests/_backends/local/io/test_reader.py

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
DoubleType,
1818
FloatType,
1919
IntegerType,
20+
JsonType,
2021
MarkdownType,
2122
Schema,
2223
StringType,
@@ -34,7 +35,6 @@
3435
ConfigurationError,
3536
InternalError,
3637
PlanError,
37-
UnsupportedFileTypeError,
3838
ValidationError,
3939
)
4040

@@ -1195,75 +1195,87 @@ def test_view_schema_validation(local_session, temp_dir):
11951195
local_session.view("df_csv_view")
11961196

11971197

1198-
def test_read_docs(local_session, temp_dir_with_test_files):
1199-
"""Test that reading from a folder works."""
1200-
df = local_session.read.docs(
1198+
def test_read_markdown(local_session, temp_dir_with_test_files):
1199+
"""Test that reading markdown files from a folder works."""
1200+
df = local_session.read.markdown(
12011201
get_globbed_path(temp_dir_with_test_files, "**/*.md"),
1202-
data_type=MarkdownType,
12031202
recursive=True)
12041203
df.collect()
12051204
assert df.schema == Schema(
12061205
[
1207-
ColumnField(name="file_path", data_type=StringType),
1206+
ColumnField(name="doc_path", data_type=StringType),
12081207
ColumnField(name="error", data_type=StringType),
12091208
ColumnField(name="content", data_type=MarkdownType),
12101209
]
12111210
)
12121211
# generate the toc
12131212
df = df.select(
1214-
col("file_path"),
1213+
col("doc_path"),
12151214
markdown.generate_toc(col("content")).alias("toc")
12161215
)
12171216
dict = df.to_pydict()
1218-
assert len(dict["file_path"]) == 5
1217+
assert len(dict["doc_path"]) == 5
12191218
assert "2 Background" in dict["toc"][0]
12201219

12211220

1222-
def test_read_docs_invalid_path(local_session):
1223-
"""Test that reading from an invalid path fails."""
1221+
def test_read_markdown_invalid_path(local_session):
1222+
"""Test that reading markdown from an invalid path fails."""
12241223
with pytest.raises(ValidationError):
1225-
local_session.read.docs(
1224+
local_session.read.markdown(
12261225
"/invalid/path",
1227-
data_type=MarkdownType,
12281226
recursive=True)
12291227

12301228

1231-
def test_read_docs_invalid_type(local_session, temp_dir_with_test_files):
1232-
"""Test that reading from an invalid path fails."""
1233-
with pytest.raises(UnsupportedFileTypeError):
1234-
local_session.read.docs(
1235-
get_globbed_path(temp_dir_with_test_files, "**/*.md"),
1236-
data_type=StringType,
1237-
recursive=True)
1229+
def test_read_json(local_session, temp_dir_with_test_files):
1230+
"""Test that reading JSON files from a folder works."""
1231+
# Create a test JSON file
1232+
import json
1233+
json_path = Path(temp_dir_with_test_files) / "test.json"
1234+
with open(json_path, 'w') as f:
1235+
json.dump({"test": "data", "number": 42}, f)
1236+
1237+
df = local_session.read.json(
1238+
get_globbed_path(temp_dir_with_test_files, "**/*.json"),
1239+
recursive=True)
1240+
df.collect()
1241+
assert df.schema == Schema(
1242+
[
1243+
ColumnField(name="doc_path", data_type=StringType),
1244+
ColumnField(name="error", data_type=StringType),
1245+
ColumnField(name="content", data_type=JsonType),
1246+
]
1247+
)
1248+
results = df.to_pydict()
1249+
# There might be other JSON files in the test directory
1250+
assert len(results["doc_path"]) >= 1
1251+
# Verify our test file is in the results
1252+
assert any("test.json" in path for path in results["doc_path"])
12381253

1239-
def test_read_docs_no_wildcard_only_valid_files(local_session, temp_dir_just_one_file):
1254+
def test_read_markdown_no_wildcard_only_valid_files(local_session, temp_dir_just_one_file):
12401255
"""Test that reading from a path with (and no wild card) only valid files works."""
1241-
df = local_session.read.docs(
1242-
[temp_dir_just_one_file],
1243-
data_type=MarkdownType)
1256+
df = local_session.read.markdown(
1257+
[temp_dir_just_one_file])
12441258
df.collect()
12451259
dict = df.to_pydict()
1246-
assert len(dict["file_path"]) == 1
1260+
assert len(dict["doc_path"]) == 1
12471261

1248-
def test_read_docs_no_files_valid_paths(local_session, temp_dir_with_test_files):
1262+
def test_read_markdown_no_files_valid_paths(local_session, temp_dir_with_test_files):
12491263
"""Test that if no files are found, we'll get a dataframe with the path and an error message."""
1250-
df = local_session.read.docs(
1264+
df = local_session.read.markdown(
12511265
get_globbed_path(temp_dir_with_test_files, "**/*.unknown_extension"),
1252-
data_type=MarkdownType,
12531266
recursive=True)
12541267
df.collect()
12551268
results = df.to_pydict()
1256-
assert len(results["file_path"]) == 0
1269+
assert len(results["doc_path"]) == 0
12571270

1258-
def test_read_docs_no_wildcard_path_is_file(local_session, temp_dir_just_one_file):
1271+
def test_read_markdown_no_wildcard_path_is_file(local_session, temp_dir_just_one_file):
12591272
"""Test that reading from a path to a file works."""
1260-
df = local_session.read.docs(
1261-
[str(Path.joinpath(Path(temp_dir_just_one_file), "file1.md"))],
1262-
data_type=MarkdownType)
1273+
df = local_session.read.markdown(
1274+
[str(Path.joinpath(Path(temp_dir_just_one_file), "file1.md"))])
12631275
df.collect()
12641276
dict = df.to_pydict()
1265-
assert len(dict["file_path"]) == 1
1266-
assert "file1.md" in dict["file_path"][0]
1277+
assert len(dict["doc_path"]) == 1
1278+
assert "file1.md" in dict["doc_path"][0]
12671279

12681280
def get_globbed_path(path: str, file_extension: str) -> list[str]:
12691281
return [str(Path.joinpath(Path(path), file_extension))]

tests/_logical_plan/serde/test_plan_serde.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
from fenic.core._serde.proto.serde_context import SerdeContext
5353
from fenic.core._serde.serde_protocol import SupportsLogicalPlanSerde
5454
from fenic.core.types import ClassDefinition
55-
from fenic.core.types.datatypes import MarkdownType
5655
from fenic.core.types.semantic_examples import MapExample, MapExampleCollection
5756

5857

@@ -80,7 +79,7 @@ def _create_plan_examples(session, temp_dir_with_test_files):
8079
DocSource: [
8180
("doc_source", DocSource.from_session_state(
8281
paths=[temp_dir_with_test_files],
83-
valid_file_extension=".md",
82+
valid_file_extension="md",
8483
recursive=True,
8584
session_state=session._session_state,
8685
)),
@@ -447,9 +446,8 @@ def test_table_source_plans(local_session, serde_implementation: SupportsLogical
447446

448447
@pytest.mark.parametrize("serde_implementation", serde_implementations)
449448
def test_doc_source_plans(local_session, serde_implementation: SupportsLogicalPlanSerde, temp_dir_with_test_files):
450-
df_docs = local_session.read.docs(
449+
df_docs = local_session.read.markdown(
451450
[temp_dir_with_test_files],
452-
data_type=MarkdownType,
453451
recursive=True
454452
)
455453
plan = df_docs._logical_plan

0 commit comments

Comments
 (0)