mantzaris · SamiraJahangiri · Jun 9, 2025 · Jun 10, 2025 · Jun 12, 2025
diff --git a/Manifest.toml b/Manifest.toml
diff --git a/Project.toml b/Project.toml
@@ -14,6 +14,7 @@ Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -23,6 +24,9 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
+BytePairEncoding = "0.5.2"
+Revise = "3.8.0"
+TextEncodeBase = "0.8.3"
 Unicode = ">=1.10.8, <1.12"
 julia = ">=1.9"
 

diff --git a/src/TextSpace.jl b/src/TextSpace.jl
@@ -29,10 +29,11 @@ resource(name) = joinpath(@__DIR__, "resources", name) # e.g. "gpt2_merges.txt"
 include("preprocessing/__init__.jl")
 include("utils/__init__.jl") 
 
+
 include(joinpath(@__DIR__, "pipeline", "Pipeline.jl"))
 # #now use
+@reexport using .Plumbing
 @reexport using .Pipeline
-
 # high-level embeddings
 # include(joinpath(@__DIR__, "embeddings", "CharacterEmbeddings.jl"))
 # @reexport using .CharacterEmbeddings 

diff --git a/src/preprocessing/CleanText.jl b/src/preprocessing/CleanText.jl
@@ -106,18 +106,12 @@ Strip all combining diacritical marks (Unicode category *Mn*) from `text`
 while leaving base characters intact.  Works on Julia 1.6 - 1.11.
 """
 function remove_accents(text::AbstractString)::String
+    # Decompose to NFD so accents become separate combining code points.
     nfd = Unicode.normalize(text, :NFD)
-
-    if isdefined(Unicode, :combining_class)      # >= 1.10 fast path
-        io = IOBuffer()
-        @inbounds for c in nfd
-            Unicode.combining_class(c) == 0 && write(io, c)
-        end
-        return Unicode.normalize(String(take!(io)), :NFC)
-    else                                          # 1.6 - 1.9 fallback
-        stripped = replace(nfd, r"\p{Mn}" => "")
-        return Unicode.normalize(stripped, :NFC)
-    end
+    # Delete every code point in Unicode category Mn (regex class \p{Mn}).
+    stripped = replace(nfd, r"\p{Mn}" => "")
+    # Recompose to NFC before returning the result as a String.
+    return Unicode.normalize(stripped, :NFC)
 end
 
 

diff --git a/src/preprocessing/__init__.jl b/src/preprocessing/__init__.jl
@@ -1,19 +1,20 @@
 module Plumbing
 
-
 include("CleanText.jl")
 include("TextNormalization.jl")
 include("Tokenization.jl")
 include("CharProcessing.jl")
 include("SentenceProcessing.jl")
 include("ParagraphProcessing.jl")
 
-
 export clean_text, strip_zero_width, normalize_whitespace,
-       tokenize, tokenize_batch,
-       tokenize_char,
-       split_sentences,
-       split_paragraphs,
-       filter_paragraphs
-
-end
+        remove_punctuation, remove_emojis, remove_accents,
+        tokenize, tokenize_batch, unwrap_lines,
+        tokenize_char, char_tokens,
+        split_sentences,
+        split_paragraphs,
+        filter_paragraphs, normalize_unicode, paragraph_windows, 
+        merge_short_paragraphs, _is_blank_paragraph, drop_empty_paragraph,  # NOTE(review): `_is_blank_paragraph` has a leading underscore yet is exported; confirm it is meant to be public
+        strip_outer_quotes, SlidingSentenceWindow,
+        basic_tokenize, strip_punctuation, ngrams, WHITESPACE_REGEX
+end # module Plumbing
diff --git a/test/pipeline/__init__.jl b/test/pipeline/__init__.jl
@@ -0,0 +1,2 @@
+# load all pipeline test files
+include("preprocessing_pipeline_tests.jl") 
diff --git a/test/preprocessing/__init__.jl b/test/preprocessing/__init__.jl
@@ -0,0 +1,8 @@
+# test/preprocessing/__init__.jl
+include("preprocessing_cleantext_tests.jl")
+include("preprocessing_textnormalization_tests.jl")
+include("preprocessing_tokenization_tests.jl")
+include("preprocessing_char_tests.jl")
+include("preprocessing_sentence_tests.jl")
+include("preprocessing_paragraph_tests.jl")
+include("preprocessing_subword_pipeline_tests.jl")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# load all pipeline test files
		include("preprocessing_pipeline_tests.jl")