1414
1515from fenic .api .functions import col
1616from fenic .core ._logical_plan .plans import DocSource , FileSource
17- from fenic .core .error import UnsupportedFileTypeError , ValidationError
17+ from fenic .core .error import ValidationError
1818from fenic .core .types .datatypes import JsonType , MarkdownType
1919
2020
@@ -263,63 +263,118 @@ def _read_file(
263263
264264 return DataFrame ._from_logical_plan (logical_node , self ._session_state )
265265
266- def docs (
266+ def markdown (
267267 self ,
268268 paths : Union [str , list [str ]],
269- data_type : Union [MarkdownType , JsonType ],
270269 exclude : Optional [str ] = None ,
271270 recursive : bool = False ,
272271 ) -> DataFrame :
273- r"""Load a DataFrame from a list of paths of documents ( markdown or json) .
272+ r"""Load a DataFrame from markdown files .
274273
275274 Args:
276275 paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
277- data_type: Data type that will be used to cast the content of the files.
278- One of MarkdownType or JsonType.
279276 exclude: A regex pattern to exclude files.
280277 If it is not provided no files will be excluded.
281278 recursive: Whether to recursively load files from the folder.
282279
283280 Returns:
284- DataFrame: A dataframe with all the documents found in the paths.
281+ DataFrame: A dataframe with all the markdown documents found in the paths.
285282 Each document is a row in the dataframe.
286283
287284 Raises:
288- ValidationError: If any file does not have a `.md` or `.json` depending on the data_type.
289- UnsupportedFileTypeError: If the data_type is not supported.
285+ ValidationError: If any file does not have a `.md` extension.
290286
291287 Notes:
292288 - Each row in the dataframe corresponds to a file in the list of paths.
293289 - The dataframe has the following columns:
294- - file_path : The path to the file .
290+ - doc_path : The path to the document .
295291 - error: The error message if the file failed to be loaded.
296- - content: The content of the file casted to the data_type .
292+ - content: The content of the file cast to MarkdownType .
297293 - Recursive loading is supported in conjunction with the '**' glob pattern,
298294 e.g. `data/**/*.md` will load all markdown files in the `data` folder and all subfolders
299295 when recursive is set to True.
300296 Without recursive = True, then ** behaves like a single '*' pattern.
301297
302298 Example: Read all the markdown files in a folder and all its subfolders.
303299 ```python
304- df = session.read.docs ("data/docs/**/*.md", data_type=MarkdownType , recursive=True)
300+ df = session.read.markdown ("data/docs/**/*.md", recursive=True)
305301 ```
306302
307303 Example: Read a folder of markdown files excluding some files.
308304 ```python
309- df = session.read.docs ("data/docs/*.md", data_type=MarkdownType , exclude=r"\.bak.md$")
305+ df = session.read.markdown ("data/docs/*.md", exclude=r"\.bak\.md$")
310306 ```
311307
312308 """
313- if data_type not in [MarkdownType , JsonType ]:
314- raise UnsupportedFileTypeError (f"Unsupported file type: { data_type } " )
309+ if isinstance (paths , str ):
310+ paths = [paths ]
311+
312+ logical_node = DocSource .from_session_state (
313+ paths = paths ,
314+ valid_file_extension = "md" ,
315+ exclude = exclude ,
316+ recursive = recursive ,
317+ session_state = self ._session_state ,
318+ )
319+ from fenic .api .dataframe import DataFrame
320+
321+ df = DataFrame ._from_logical_plan (logical_node , self ._session_state )
322+ df = df .select (
323+ col ("file_path" ).alias ("doc_path" ),
324+ col ("error" ),
325+ col ("content" ).cast (MarkdownType ).alias ("content" ),
326+ )
327+ return df
328+
329+ def json (
330+ self ,
331+ paths : Union [str , list [str ]],
332+ exclude : Optional [str ] = None ,
333+ recursive : bool = False ,
334+ ) -> DataFrame :
335+ r"""Load a DataFrame from JSON files.
315336
337+ Args:
338+ paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
339+ exclude: A regex pattern to exclude files.
340+ If it is not provided no files will be excluded.
341+ recursive: Whether to recursively load files from the folder.
342+
343+ Returns:
344+ DataFrame: A dataframe with all the JSON documents found in the paths.
345+ Each document is a row in the dataframe.
346+
347+ Raises:
348+ ValidationError: If any file does not have a `.json` extension.
349+
350+ Notes:
351+ - Each row in the dataframe corresponds to a file in the list of paths.
352+ - The dataframe has the following columns:
353+ - doc_path: The path to the document.
354+ - error: The error message if the file failed to be loaded.
355+ - content: The content of the file cast to JsonType.
356+ - Recursive loading is supported in conjunction with the '**' glob pattern,
357+ e.g. `data/**/*.json` will load all JSON files in the `data` folder and all subfolders
358+ when recursive is set to True.
359+ Without recursive = True, then ** behaves like a single '*' pattern.
360+
361+ Example: Read all the JSON files in a folder and all its subfolders.
362+ ```python
363+ df = session.read.json("data/docs/**/*.json", recursive=True)
364+ ```
365+
366+ Example: Read a folder of JSON files excluding some files.
367+ ```python
368+ df = session.read.json("data/docs/*.json", exclude=r"\.bak\.json$")
369+ ```
370+
371+ """
316372 if isinstance (paths , str ):
317373 paths = [paths ]
318374
319- valid_file_extension = "md" if data_type == MarkdownType else "json"
320375 logical_node = DocSource .from_session_state (
321376 paths = paths ,
322- valid_file_extension = valid_file_extension ,
377+ valid_file_extension = "json" ,
323378 exclude = exclude ,
324379 recursive = recursive ,
325380 session_state = self ._session_state ,
@@ -328,8 +383,8 @@ def docs(
328383
329384 df = DataFrame ._from_logical_plan (logical_node , self ._session_state )
330385 df = df .select (
331- col ("file_path" ),
386+ col ("file_path" ). alias ( "doc_path" ) ,
332387 col ("error" ),
333- col ("content" ).cast (data_type ).alias ("content" ),
388+ col ("content" ).cast (JsonType ).alias ("content" ),
334389 )
335390 return df
0 commit comments