diff --git a/Manifest.toml b/Manifest.toml
index f2e5ec9..c990627 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,8 +1,8 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.9.3"
+julia_version = "1.11.4"
 manifest_format = "2.0"
-project_hash = "aa82c28e8908bfa177480b2eb90c73573e8c6c54"
+project_hash = "8d86f861eb2145d57a00b480d416501d1e0998c6"
 
 [[deps.AbstractFFTs]]
 deps = ["LinearAlgebra"]
@@ -17,9 +17,9 @@ weakdeps = ["ChainRulesCore", "Test"]
 
 [[deps.Accessors]]
 deps = ["CompositionsBase", "ConstructionBase", "Dates", "InverseFunctions", "MacroTools"]
-git-tree-sha1 = "0ba8f4c1f06707985ffb4804fdad1bf97b233897"
+git-tree-sha1 = "3b86719127f50670efe356bc11073d84b4ed7a5d"
 uuid = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
-version = "0.1.41"
+version = "0.1.42"
 
     [deps.Accessors.extensions]
     AxisKeysExt = "AxisKeys"
@@ -34,7 +34,6 @@ version = "0.1.41"
     AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
     IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
     LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-    Requires = "ae029012-a4dd-5104-9daa-d747884805df"
     StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
     StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
     Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -58,28 +57,41 @@ uuid = "66dad0bd-aa9a-41b7-9441-69ab47430ed8"
 version = "1.1.3"
 
 [[deps.ArgCheck]]
-git-tree-sha1 = "680b3b8759bd4c54052ada14e52355ab69e07876"
+git-tree-sha1 = "f9e9a66c9b7be1ad7372bbd9b062d9230c30c5ce"
 uuid = "dce04be8-c92d-5529-be00-80e4d2c0e197"
-version = "2.4.0"
+version = "2.5.0"
 
 [[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
-version = "1.1.1"
+version = "1.1.2"
 
 [[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+version = "1.11.0"
 
 [[deps.Atomix]]
 deps = ["UnsafeAtomics"]
-git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
+git-tree-sha1 = "b5bb4dc6248fde467be2a863eb8452993e74d402"
 uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
-version = "0.1.0"
+version = "1.1.1"
+
+    [deps.Atomix.extensions]
+    AtomixCUDAExt = "CUDA"
+    AtomixMetalExt = "Metal"
+    AtomixOpenCLExt = "OpenCL"
+    AtomixoneAPIExt = "oneAPI"
+
+    [deps.Atomix.weakdeps]
+    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+    Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+    OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
+    oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [[deps.BangBang]]
-deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"]
-git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae"
+deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra"]
+git-tree-sha1 = "26f41e1df02c330c4fa1e98d4aa2168fdafc9b1f"
 uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66"
-version = "0.4.3"
+version = "0.4.4"
 
     [deps.BangBang.extensions]
     BangBangChainRulesCoreExt = "ChainRulesCore"
@@ -99,6 +111,7 @@ version = "0.4.3"
 
 [[deps.Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+version = "1.11.0"
 
 [[deps.Baselet]]
 git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e"
@@ -113,8 +126,6 @@ version = "0.1.9"
 [[deps.BytePairEncoding]]
 deps = ["Artifacts", "Base64", "DataStructures", "DoubleArrayTries", "LRUCache", "LazyArtifacts", "StructWalk", "TextEncodeBase", "Unicode"]
 git-tree-sha1 = "b8d2edaf190d01d6a1c30b80d1db2d866fbe7371"
-repo-rev = "master"
-repo-url = "https://github.com/chengchingwen/BytePairEncoding.jl.git"
 uuid = "a4280ba5-8788-555a-8ca8-4a8c3d966a71"
 version = "0.5.2"
 
@@ -125,9 +136,9 @@ version = "0.5.0"
 
 [[deps.ChainRules]]
 deps = ["Adapt", "ChainRulesCore", "Compat", "Distributed", "GPUArraysCore", "IrrationalConstants", "LinearAlgebra", "Random", "RealDot", "SparseArrays", "SparseInverseSubset", "Statistics", "StructArrays", "SuiteSparse"]
-git-tree-sha1 = "a975ae558af61a2a48720a6271661bf2621e0f4e"
+git-tree-sha1 = "204e9b212da5cc7df632b58af8d49763383f47fa"
 uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2"
-version = "1.72.3"
+version = "1.72.4"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra"]
@@ -139,6 +150,12 @@ weakdeps = ["SparseArrays"]
     [deps.ChainRulesCore.extensions]
     ChainRulesCoreSparseArraysExt = "SparseArrays"
 
+[[deps.CodeTracking]]
+deps = ["InteractiveUtils", "UUIDs"]
+git-tree-sha1 = "062c5e1a5bf6ada13db96a4ae4749a4c2234f521"
+uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
+version = "1.3.9"
+
 [[deps.CodecZlib]]
 deps = ["TranscodingStreams", "Zlib_jll"]
 git-tree-sha1 = "962834c22b66e32aa10f7611c08c8ca4e20749a9"
@@ -164,7 +181,7 @@ weakdeps = ["Dates", "LinearAlgebra"]
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.5+0"
+version = "1.1.1+0"
 
 [[deps.CompositionsBase]]
 git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad"
@@ -227,6 +244,7 @@ version = "1.0.0"
 [[deps.Dates]]
 deps = ["Printf"]
 uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+version = "1.11.0"
 
 [[deps.DefineSingletons]]
 git-tree-sha1 = "0fba8b706d0178b4dc7fd44a96a92382c9065c2c"
@@ -254,11 +272,12 @@ version = "1.15.1"
 [[deps.Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+version = "1.11.0"
 
 [[deps.DocStringExtensions]]
-git-tree-sha1 = "e7b7e6f178525d17c720ab9c081e4ef04429f860"
+git-tree-sha1 = "7442a5dfe1ebb773c29cc2962a8980f47221d76c"
 uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-version = "0.9.4"
+version = "0.9.5"
 
 [[deps.DoubleArrayTries]]
 deps = ["OffsetArrays", "Preferences", "StringViews"]
@@ -291,6 +310,7 @@ version = "0.1.1"
 
 [[deps.FileWatching]]
 uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
+version = "1.11.0"
 
 [[deps.FillArrays]]
 deps = ["LinearAlgebra"]
@@ -309,10 +329,10 @@ version = "1.13.0"
     Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [[deps.Flux]]
-deps = ["Adapt", "ChainRulesCore", "Compat", "Functors", "LinearAlgebra", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"]
-git-tree-sha1 = "f3b4e96288e8bee94cd1e230c1a9387d3e03f788"
+deps = ["Adapt", "ChainRulesCore", "Compat", "Functors", "LinearAlgebra", "MLDataDevices", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"]
+git-tree-sha1 = "df520a0727f843576801a0294f5be1a94be28e23"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.14.21"
+version = "0.14.25"
 
     [deps.Flux.extensions]
     FluxAMDGPUExt = "AMDGPU"
@@ -321,14 +341,12 @@ version = "0.14.21"
     FluxEnzymeExt = "Enzyme"
     FluxMPIExt = "MPI"
     FluxMPINCCLExt = ["CUDA", "MPI", "NCCL"]
-    FluxMetalExt = "Metal"
 
     [deps.Flux.weakdeps]
     AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
     CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
     Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
     MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-    Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
     NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
     cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
@@ -356,18 +374,19 @@ version = "0.4.12"
 [[deps.Future]]
 deps = ["Random"]
 uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+version = "1.11.0"
 
 [[deps.GPUArrays]]
-deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "04661708f5301394a1f1be86a07a89e835900db6"
+deps = ["Adapt", "GPUArraysCore", "KernelAbstractions", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "ScopedValues", "Serialization", "Statistics"]
+git-tree-sha1 = "eea7b3a1964b4de269bb380462a9da604be7fcdb"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "10.2.3"
+version = "11.2.2"
 
 [[deps.GPUArraysCore]]
 deps = ["Adapt"]
-git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
+git-tree-sha1 = "83cf05ab16a73219e5f6bd1bdfa9848fa24ac627"
 uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
-version = "0.1.6"
+version = "0.2.0"
 
 [[deps.HTML_Entities]]
 deps = ["StrTables"]
@@ -400,6 +419,7 @@ version = "0.3.1"
 [[deps.InteractiveUtils]]
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+version = "1.11.0"
 
 [[deps.InverseFunctions]]
 git-tree-sha1 = "a779299d77cd080bf77b97535acecd73e1c5e5cb"
@@ -445,6 +465,12 @@ version = "1.14.3"
     [deps.JSON3.weakdeps]
     ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
 
+[[deps.JuliaInterpreter]]
+deps = ["CodeTracking", "InteractiveUtils", "Random", "UUIDs"]
+git-tree-sha1 = "6ac9e4acc417a5b534ace12690bc6973c25b862f"
+uuid = "aa1ae85d-cabe-5617-a682-6adf51b2e16a"
+version = "0.10.3"
+
 [[deps.JuliaVariables]]
 deps = ["MLStyle", "NameResolution"]
 git-tree-sha1 = "49fb3cb53362ddadb4415e9b73926d6b40709e70"
@@ -468,10 +494,10 @@ version = "0.9.34"
     SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[deps.LLVM]]
-deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
-git-tree-sha1 = "020abd49586480c1be84f57da0017b5d3db73f7c"
+deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Unicode"]
+git-tree-sha1 = "5e8b243b2e4c86648dac82cf767ae1456000b92d"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "8.0.0"
+version = "9.4.0"
 
     [deps.LLVM.extensions]
     BFloat16sExt = "BFloat16s"
@@ -481,9 +507,9 @@ version = "8.0.0"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "c2636c264861edc6d305e6b4d528f09566d24c5e"
+git-tree-sha1 = "f8022e2c8b5eef5f30e7fb2fe52c97cc5674db23"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.30+0"
+version = "0.0.36+0"
 
 [[deps.LRUCache]]
 git-tree-sha1 = "5519b95a490ff5fe629c4a7aa3b3dfc9160498b3"
@@ -503,38 +529,47 @@ version = "0.4.6"
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
 uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+version = "1.11.0"
 
 [[deps.LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
 uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
-version = "0.6.3"
+version = "0.6.4"
 
 [[deps.LibCURL_jll]]
 deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
 uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "7.84.0+0"
+version = "8.6.0+0"
 
 [[deps.LibGit2]]
-deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+version = "1.11.0"
+
+[[deps.LibGit2_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
+uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
+version = "1.7.2+0"
 
 [[deps.LibSSH2_jll]]
 deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
 uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.10.2+0"
+version = "1.11.0+1"
 
 [[deps.Libdl]]
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+version = "1.11.0"
 
 [[deps.LinearAlgebra]]
 deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+version = "1.11.0"
 
 [[deps.LogExpFunctions]]
 deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
-git-tree-sha1 = "a2d09619db4e765091ee5c6ffe8872849de0feea"
+git-tree-sha1 = "13ca9e2586b89836fd20cccf56e57e2b9ae7f38f"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
-version = "0.3.28"
+version = "0.3.29"
 
     [deps.LogExpFunctions.extensions]
     LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
@@ -548,6 +583,7 @@ version = "0.3.28"
 
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+version = "1.11.0"
 
 [[deps.LoggingExtras]]
 deps = ["Dates", "Logging"]
@@ -555,12 +591,62 @@ git-tree-sha1 = "f02b56007b064fbfddb4c9cd60161b6dd0f40df3"
 uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36"
 version = "1.1.0"
 
+[[deps.LoweredCodeUtils]]
+deps = ["JuliaInterpreter"]
+git-tree-sha1 = "4ef1c538614e3ec30cb6383b9eb0326a5c3a9763"
+uuid = "6f1432cf-f94c-5a45-995e-cdbf5db27b0b"
+version = "3.3.0"
+
 [[deps.MLCore]]
 deps = ["DataAPI", "SimpleTraits", "Tables"]
 git-tree-sha1 = "73907695f35bc7ffd9f11f6c4f2ee8c1302084be"
 uuid = "c2834f40-e789-41da-a90e-33b280584a8c"
 version = "1.0.0"
 
+[[deps.MLDataDevices]]
+deps = ["Adapt", "Compat", "Functors", "Preferences", "Random"]
+git-tree-sha1 = "85b47bc5a8bf0c886286638585df3bec7c9f8269"
+uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
+version = "1.5.3"
+
+    [deps.MLDataDevices.extensions]
+    MLDataDevicesAMDGPUExt = "AMDGPU"
+    MLDataDevicesCUDAExt = "CUDA"
+    MLDataDevicesChainRulesCoreExt = "ChainRulesCore"
+    MLDataDevicesChainRulesExt = "ChainRules"
+    MLDataDevicesFillArraysExt = "FillArrays"
+    MLDataDevicesGPUArraysExt = "GPUArrays"
+    MLDataDevicesMLUtilsExt = "MLUtils"
+    MLDataDevicesMetalExt = ["GPUArrays", "Metal"]
+    MLDataDevicesOneHotArraysExt = "OneHotArrays"
+    MLDataDevicesReactantExt = "Reactant"
+    MLDataDevicesRecursiveArrayToolsExt = "RecursiveArrayTools"
+    MLDataDevicesReverseDiffExt = "ReverseDiff"
+    MLDataDevicesSparseArraysExt = "SparseArrays"
+    MLDataDevicesTrackerExt = "Tracker"
+    MLDataDevicesZygoteExt = "Zygote"
+    MLDataDevicescuDNNExt = ["CUDA", "cuDNN"]
+    MLDataDevicesoneAPIExt = ["GPUArrays", "oneAPI"]
+
+    [deps.MLDataDevices.weakdeps]
+    AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+    ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+    GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+    MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+    Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+    OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
+    Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+    RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
+    ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
+    SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+    Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
+    Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+    cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+    oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
 [[deps.MLStyle]]
 git-tree-sha1 = "bc38dff0548128765760c79eb7388a4b37fae2c8"
 uuid = "d8e11817-5142-5d16-987a-aa16d5891078"
@@ -580,6 +666,7 @@ version = "0.5.16"
 [[deps.Markdown]]
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+version = "1.11.0"
 
 [[deps.MbedTLS]]
 deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"]
@@ -590,7 +677,7 @@ version = "1.1.9"
 [[deps.MbedTLS_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.2+0"
+version = "2.28.6+0"
 
 [[deps.MicroCollections]]
 deps = ["Accessors", "BangBang", "InitialValues"]
@@ -606,10 +693,11 @@ version = "1.2.0"
 
 [[deps.Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+version = "1.11.0"
 
 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2022.10.11"
+version = "2023.12.12"
 
 [[deps.NNlib]]
 deps = ["Adapt", "Atomix", "ChainRulesCore", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "Random", "ScopedValues", "Statistics"]
@@ -637,9 +725,9 @@ version = "0.9.30"
 
 [[deps.NaNMath]]
 deps = ["OpenLibm_jll"]
-git-tree-sha1 = "030ea22804ef91648f29b7ad3fc15fa49d0e6e71"
+git-tree-sha1 = "9b8215b1ee9e78a293f99797cd31375471b2bcae"
 uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "1.0.3"
+version = "1.1.3"
 
 [[deps.NameResolution]]
 deps = ["PrettyPrint"]
@@ -662,25 +750,25 @@ weakdeps = ["Adapt"]
 
 [[deps.OneHotArrays]]
 deps = ["Adapt", "ChainRulesCore", "Compat", "GPUArraysCore", "LinearAlgebra", "NNlib"]
-git-tree-sha1 = "3685584454b04cd52169c787ba4d196da8a04d10"
+git-tree-sha1 = "bfe8e84c71972f77e775f75e6d8048ad3fdbe8bc"
 uuid = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
-version = "0.2.9"
+version = "0.2.10"
 
 [[deps.OpenBLAS_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.21+4"
+version = "0.3.27+1"
 
 [[deps.OpenLibm_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
-version = "0.8.1+0"
+version = "0.8.1+4"
 
 [[deps.OpenSSL]]
 deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"]
-git-tree-sha1 = "38cb508d080d21dc1128f7fb04f20387ed4c0af4"
+git-tree-sha1 = "f1a7e086c677df53e064e0fdd2c9d0b0833e3f6e"
 uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
-version = "1.4.3"
+version = "1.5.0"
 
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -701,9 +789,9 @@ uuid = "3bd65402-5787-11e9-1adc-39752487f4e2"
 version = "0.3.4"
 
 [[deps.OrderedCollections]]
-git-tree-sha1 = "cc4054e898b852042d7b503313f7ad03de99c3dd"
+git-tree-sha1 = "05868e21324cede2207c6f0f466b4bfef6d5e7ee"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.8.0"
+version = "1.8.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "PrecompileTools", "UUIDs"]
@@ -718,9 +806,13 @@ uuid = "570af359-4316-4cb7-8c74-252c00c2016b"
 version = "1.2.1"
 
 [[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.9.2"
+version = "1.11.0"
+weakdeps = ["REPL"]
+
+    [deps.Pkg.extensions]
+    REPLExt = "REPL"
 
 [[deps.PrecompileTools]]
 deps = ["Preferences"]
@@ -748,6 +840,7 @@ version = "0.1.4"
 [[deps.Printf]]
 deps = ["Unicode"]
 uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+version = "1.11.0"
 
 [[deps.ProgressLogging]]
 deps = ["Logging", "SHA", "UUIDs"]
@@ -761,12 +854,14 @@ uuid = "43287f4e-b6f4-7ad1-bb20-aadabca52c3d"
 version = "1.3.0"
 
 [[deps.REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
+deps = ["InteractiveUtils", "Markdown", "Sockets", "StyledStrings", "Unicode"]
 uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+version = "1.11.0"
 
 [[deps.Random]]
-deps = ["SHA", "Serialization"]
+deps = ["SHA"]
 uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+version = "1.11.0"
 
 [[deps.RealDot]]
 deps = ["LinearAlgebra"]
@@ -791,6 +886,16 @@ git-tree-sha1 = "62389eeff14780bfe55195b7204c0d8738436d64"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
 version = "1.3.1"
 
+[[deps.Revise]]
+deps = ["CodeTracking", "FileWatching", "JuliaInterpreter", "LibGit2", "LoweredCodeUtils", "OrderedCollections", "REPL", "Requires", "UUIDs", "Unicode"]
+git-tree-sha1 = "f6f7d30fb0d61c64d0cfe56cf085a7c9e7d5bc80"
+uuid = "295af30f-e4ad-537b-8983-00126c2a3abe"
+version = "3.8.0"
+weakdeps = ["Distributed"]
+
+    [deps.Revise.extensions]
+    DistributedExt = "Distributed"
+
 [[deps.RustRegex]]
 deps = ["rure_jll"]
 git-tree-sha1 = "16be5e710d7b980678ec0d8c61d4c00e9a5591e3"
@@ -815,6 +920,7 @@ version = "1.2.1"
 
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+version = "1.11.0"
 
 [[deps.Setfield]]
 deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"]
@@ -840,6 +946,7 @@ version = "0.9.4"
 
 [[deps.Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+version = "1.11.0"
 
 [[deps.SortingAlgorithms]]
 deps = ["DataStructures"]
@@ -850,6 +957,7 @@ version = "1.2.1"
 [[deps.SparseArrays]]
 deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+version = "1.11.0"
 
 [[deps.SparseInverseSubset]]
 deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"]
@@ -890,21 +998,26 @@ uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 version = "1.4.3"
 
 [[deps.Statistics]]
-deps = ["LinearAlgebra", "SparseArrays"]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "ae3bb1eb3bba077cd276bc5cfc337cc65c3075c0"
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-version = "1.9.0"
+version = "1.11.1"
+weakdeps = ["SparseArrays"]
+
+    [deps.Statistics.extensions]
+    SparseArraysExt = ["SparseArrays"]
 
 [[deps.StatsAPI]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed"
+git-tree-sha1 = "9d72a13a3f4dd3795a195ac5a44d7d6ff5f552ff"
 uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
-version = "1.7.0"
+version = "1.7.1"
 
 [[deps.StatsBase]]
 deps = ["AliasTables", "DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
-git-tree-sha1 = "29321314c920c26684834965ec2ce0dacc9cf8e5"
+git-tree-sha1 = "b81c5035922cc89c2d9523afc6c54be512411466"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.34.4"
+version = "0.34.5"
 
 [[deps.StrTables]]
 deps = ["Dates"]
@@ -919,14 +1032,15 @@ version = "1.3.4"
 
 [[deps.StructArrays]]
 deps = ["ConstructionBase", "DataAPI", "Tables"]
-git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be"
+git-tree-sha1 = "8ad2e38cbb812e29348719cc63580ec1dfeb9de4"
 uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
-version = "0.6.18"
-weakdeps = ["Adapt", "GPUArraysCore", "SparseArrays", "StaticArrays"]
+version = "0.7.1"
+weakdeps = ["Adapt", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "SparseArrays", "StaticArrays"]
 
     [deps.StructArrays.extensions]
     StructArraysAdaptExt = "Adapt"
-    StructArraysGPUArraysCoreExt = "GPUArraysCore"
+    StructArraysGPUArraysCoreExt = ["GPUArraysCore", "KernelAbstractions"]
+    StructArraysLinearAlgebraExt = "LinearAlgebra"
     StructArraysSparseArraysExt = "SparseArrays"
     StructArraysStaticArraysExt = "StaticArrays"
 
@@ -942,14 +1056,18 @@ git-tree-sha1 = "ef626534f40a9d99b3dafdbd54cfe411ad86e3b8"
 uuid = "31cdf514-beb7-4750-89db-dda9d2eb8d3d"
 version = "0.2.1"
 
+[[deps.StyledStrings]]
+uuid = "f489334b-da3d-4c2e-b8f0-e476e12c162b"
+version = "1.11.0"
+
 [[deps.SuiteSparse]]
 deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
 uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 
 [[deps.SuiteSparse_jll]]
-deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
+deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
 uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
-version = "5.10.1+6"
+version = "7.7.0+0"
 
 [[deps.TOML]]
 deps = ["Dates"]
@@ -964,9 +1082,9 @@ version = "1.0.1"
 
 [[deps.Tables]]
 deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "OrderedCollections", "TableTraits"]
-git-tree-sha1 = "598cd7c1f68d1e205689b1c2fe65a9f85846f297"
+git-tree-sha1 = "f2c1efbc8f3a609aadf318094f8fc5204bdaf344"
 uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "1.12.0"
+version = "1.12.1"
 
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
@@ -976,6 +1094,7 @@ version = "1.10.0"
 [[deps.Test]]
 deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+version = "1.11.0"
 
 [[deps.TextEncodeBase]]
 deps = ["DataStructures", "DoubleArrayTries", "FuncPipelines", "PartialFunctions", "PrimitiveOneHot", "RustRegex", "StaticArrays", "StructWalk", "Unicode", "WordTokenizers"]
@@ -1018,14 +1137,20 @@ version = "1.5.2"
 [[deps.UUIDs]]
 deps = ["Random", "SHA"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+version = "1.11.0"
 
 [[deps.Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+version = "1.11.0"
 
 [[deps.UnsafeAtomics]]
-git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
+git-tree-sha1 = "b13c4edda90890e5b04ba24e20a310fbe6f249ff"
 uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
-version = "0.2.1"
+version = "0.3.0"
+weakdeps = ["LLVM"]
+
+    [deps.UnsafeAtomics.extensions]
+    UnsafeAtomicsLLVM = ["LLVM"]
 
 [[deps.WordTokenizers]]
 deps = ["DataDeps", "HTML_Entities", "StrTables", "Unicode"]
@@ -1036,7 +1161,7 @@ version = "0.5.6"
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+0"
+version = "1.2.13+1"
 
 [[deps.Zygote]]
 deps = ["AbstractFFTs", "ChainRules", "ChainRulesCore", "DiffRules", "Distributed", "FillArrays", "ForwardDiff", "GPUArrays", "GPUArraysCore", "IRTools", "InteractiveUtils", "LinearAlgebra", "LogExpFunctions", "MacroTools", "NaNMath", "PrecompileTools", "Random", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "ZygoteRules"]
@@ -1063,17 +1188,17 @@ version = "0.2.7"
 [[deps.libblastrampoline_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.8.0+0"
+version = "5.11.0+0"
 
 [[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.48.0+0"
+version = "1.59.0+0"
 
 [[deps.p7zip_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.4.0+0"
+version = "17.4.0+2"
 
 [[deps.rure_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
diff --git a/Project.toml b/Project.toml
index 6807277..95270ef 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -23,6 +24,9 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
+BytePairEncoding = "0.5.2"
+Revise = "3.8.0"
+TextEncodeBase = "0.8.3"
 Unicode = ">=1.10.8, <1.12"
 julia = ">=1.9"
 
diff --git a/src/TextSpace.jl b/src/TextSpace.jl
index cdd3a91..145c730 100644
--- a/src/TextSpace.jl
+++ b/src/TextSpace.jl
@@ -29,10 +29,11 @@ resource(name) = joinpath(@__DIR__, "resources", name) # e.g. "gpt2_merges.txt"
 include("preprocessing/__init__.jl")
 include("utils/__init__.jl") 
 
+
 include(joinpath(@__DIR__, "pipeline", "Pipeline.jl"))
 # #now use
+@reexport using .Plumbing
 @reexport using .Pipeline
-
 # high-level embeddings
 # include(joinpath(@__DIR__, "embeddings", "CharacterEmbeddings.jl"))
 # @reexport using .CharacterEmbeddings 
diff --git a/src/preprocessing/CleanText.jl b/src/preprocessing/CleanText.jl
index b79f054..1bd90fd 100644
--- a/src/preprocessing/CleanText.jl
+++ b/src/preprocessing/CleanText.jl
@@ -106,18 +106,12 @@ Strip all combining diacritical marks (Unicode category *Mn*) from `text`
 while leaving base characters intact.  Works on Julia 1.6 - 1.11.
 """
 function remove_accents(text::AbstractString)::String
+    # normalize to NFD to separate characters from their accents
     nfd = Unicode.normalize(text, :NFD)
-
-    if isdefined(Unicode, :combining_class)      # >= 1.10 fast path
-        io = IOBuffer()
-        @inbounds for c in nfd
-            Unicode.combining_class(c) == 0 && write(io, c)
-        end
-        return Unicode.normalize(String(take!(io)), :NFC)
-    else                                          # 1.6 - 1.9 fallback
-        stripped = replace(nfd, r"\p{Mn}" => "")
-        return Unicode.normalize(stripped, :NFC)
-    end
+    # remove all combining diacritical marks using regex
+    stripped = replace(nfd, r"\p{Mn}" => "")
+    # normalize back to NFC for canonical representation
+    return Unicode.normalize(stripped, :NFC)
 end
 
 
diff --git a/src/preprocessing/__init__.jl b/src/preprocessing/__init__.jl
index 070e031..0787c1d 100644
--- a/src/preprocessing/__init__.jl
+++ b/src/preprocessing/__init__.jl
@@ -1,6 +1,5 @@
 module Plumbing
 
-
 include("CleanText.jl")
 include("TextNormalization.jl")
 include("Tokenization.jl")
@@ -8,12 +7,14 @@ include("CharProcessing.jl")
 include("SentenceProcessing.jl")
 include("ParagraphProcessing.jl")
 
-
 export clean_text, strip_zero_width, normalize_whitespace,
-       tokenize, tokenize_batch,
-       tokenize_char,
-       split_sentences,
-       split_paragraphs,
-       filter_paragraphs
-
-end
+        remove_punctuation, remove_emojis, remove_accents,
+        tokenize, tokenize_batch, unwrap_lines,
+        tokenize_char, char_tokens,
+        split_sentences,
+        split_paragraphs,
+        filter_paragraphs, normalize_unicode, paragraph_windows, 
+        merge_short_paragraphs, _is_blank_paragraph, drop_empty_paragraph, 
+        strip_outer_quotes, SlidingSentenceWindow,
+        basic_tokenize, strip_punctuation, ngrams, WHITESPACE_REGEX
+end
\ No newline at end of file
diff --git a/test/pipeline/__init__.jl b/test/pipeline/__init__.jl
new file mode 100644
index 0000000..ab568f9
--- /dev/null
+++ b/test/pipeline/__init__.jl
@@ -0,0 +1,2 @@
+# load all pipeline test files
+include("preprocessing_pipeline_tests.jl") 
\ No newline at end of file
diff --git a/test/preprocessing/__init__.jl b/test/preprocessing/__init__.jl
new file mode 100644
index 0000000..b5d6c20
--- /dev/null
+++ b/test/preprocessing/__init__.jl
@@ -0,0 +1,8 @@
+# test/preprocessing/__init__.jl
+include("preprocessing_cleantext_tests.jl")
+include("preprocessing_textnormalization_tests.jl")
+include("preprocessing_tokenization_tests.jl")
+include("preprocessing_char_tests.jl")
+include("preprocessing_sentence_tests.jl")
+include("preprocessing_paragraph_tests.jl")
+include("preprocessing_subword_pipeline_tests.jl")
diff --git a/test/preprocessing/preprocessing_char_pipeline_tests.jl b/test/preprocessing/preprocessing_char_pipeline_tests.jl
deleted file mode 100644
index 344921c..0000000
--- a/test/preprocessing/preprocessing_char_pipeline_tests.jl
+++ /dev/null
@@ -1,495 +0,0 @@
-# include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "Preprocessing.jl"))
-
-
-
-# @testset "preprocess_for_char_embeddings" begin
-    
-#     #raw string - new vocabulary
-    
-#     txt  = "Hello 😊"
-#     out  = preprocess_for_char_embeddings(txt)          # build vocab
-
-#     #cleaning left the visible text unchanged
-#     @test out.cleaned_text == txt
-
-#     #tokenisation: default keeps case, drops spaces
-#     @test out.chars == ["H","e","l","l","o"," ","😊"]
-
-#     # <unk> exists and has a valid positive id
-#     @test haskey(out.vocabulary.token2id, "<unk>")
-#     @test out.vocabulary.unk_id >= 1
-
-#     #char_ids are a 1-to-1 mapping of the returned characters
-#     @test out.char_ids ==
-#           [out.vocabulary.token2id[c] for c in out.chars]
-
-    
-#     #re-using an existing vocabulary
-#     txt2 = "Hola"
-#     out2 = preprocess_for_char_embeddings(txt2; vocab = out.vocabulary)
-
-#     #should not create a brand-new Vocabulary object
-#     @test out2.vocabulary === out.vocabulary
-
-#     #all ids are within the known range
-#     @test all(1 ≤ id ≤ length(out.vocabulary.id2token) for id in out2.char_ids)
-
-    
-#     #file-path input & unknown characters fall back to <unk>
-#     mktemp() do path, io          # ← path first, stream second
-#         write(io, "¿Qué?")
-#         close(io)
-    
-#         out3 = preprocess_for_char_embeddings(path; vocab = out.vocabulary)
-    
-#         # first character should be mapped to the unk id
-#         @test out3.char_ids[1] == out.vocabulary.unk_id
-#     end
-# end
-
-
-# @testset "preprocess_for_char_embeddings - additional coverage" begin
-    
-#     #clean_options + char_options flags
-#     txt = "Go :)"
-#     res = preprocess_for_char_embeddings(txt;
-#                                         clean_options = Dict(
-#                                             :case_transform        => :lower,
-#                                             :do_remove_punctuation => true
-#                                         ),
-#                                         char_options  = Dict(:keep_space => true))
-
-#     @test res.cleaned_text == "go"               # no trailing blank after normalisation
-#     @test res.chars == ["g","o"]
-#     @test res.char_ids[1:2] ==
-#         [res.vocabulary.token2id[c] for c in res.chars]     # length 2
-
-
-
-#     # add_new = true really grows the vocabulary and updates counts
-#     base = preprocess_for_char_embeddings("abc")    # fresh vocab
-#     orig_vocab = base.vocabulary
-#     orig_len   = length(orig_vocab.id2token)
-
-#     extra = preprocess_for_char_embeddings("abcx";
-#                                            vocab      = orig_vocab,
-#                                            id_options = Dict(:add_new => true))
-
-#     @test length(orig_vocab.id2token) == orig_len + 1       # x appended
-#     new_id = orig_vocab.token2id["x"]
-#     @test orig_vocab.counts[new_id] == 1                    # counts updated
-#     @test extra.char_ids[end] == new_id
-
-
-    
-#     #min_freq filter in vocab_options
-#     txt_lowfreq = "aaab"                                   # 'a' freq=3, 'b' freq=1
-#     vfilt = preprocess_for_char_embeddings(txt_lowfreq;
-#                                            vocab_options = Dict(:min_freq => 2))
-
-#     @test haskey(vfilt.vocabulary.token2id, "a")            # kept
-#     @test !haskey(vfilt.vocabulary.token2id, "b")           # filtered out
-#     @test vfilt.char_ids[end] == vfilt.vocabulary.unk_id    #'b' - <unk>
-
-
-#     #tmp-file cleanup guard
-#     tmp_path = ""
-#     mktemp() do path, io
-#         tmp_path = path
-#         write(io, "Test")
-#         close(io)
-#         preprocess_for_char_embeddings(path)    # just exercise the call
-#     end
-#     @test !isfile(tmp_path)                     # mktemp has removed the file
-# end
-
-
-# @testset "preprocess_for_char_embeddings - edge cases" begin
-
-#     #mixed accents + uppercase transform + accent-stripping
-#     txt_acc = "Áaá\n"
-#     out_acc = preprocess_for_char_embeddings(txt_acc;
-#                                             clean_options = Dict(
-#                                                 :case_transform     => :upper,
-#                                                 :do_remove_accents  => true,
-#                                                 :unicode_normalize  => true
-#                                             ))
-
-#     expected_clean = "AAA"                     # three letters, newline removed
-#     expected_chars = ["A","A","A"]
-
-#     @test out_acc.cleaned_text == expected_clean
-#     @test out_acc.chars == expected_chars
-#     @test out_acc.char_ids == [out_acc.vocabulary.token2id["A"] for _ in 1:3]
-
-
-    
-#     #user-supplied special tokens (deduplicated, kept up-front)
-    
-#     res_spec = preprocess_for_char_embeddings("xy";
-#                     vocab_options = Dict(:special_tokens => ["<pad>", "<unk>"]))
-
-#     @test res_spec.vocabulary.id2token[1:2] == ["<pad>", "<unk>"]
-#     # no duplicates even if we supply <unk> again
-#     res_spec2 = preprocess_for_char_embeddings("xy";
-#                      vocab_options = Dict(:special_tokens => ["<unk>", "<unk>"]))
-#     @test res_spec2.vocabulary.id2token[1] == "<unk>" &&
-#           length(res_spec2.vocabulary.id2token[1:2]) == 2   # some non-<unk> token occupies slot 2
-
-
-
-    
-#     #update_counts = false leaves counts unchanged
-    
-#     base2   = preprocess_for_char_embeddings("zzz")
-#     vocab2  = base2.vocabulary
-#     counts0 = deepcopy(vocab2.counts)
-
-#     preprocess_for_char_embeddings("zzz";
-#                                    vocab       = vocab2,
-#                                    id_options  = Dict(:update_counts => false))
-
-#     @test vocab2.counts == counts0              # nothing incremented
-
-
-    
-#     #unknown character mapped to <unk> when add_new = false
-#     res_unk = preprocess_for_char_embeddings("§"; vocab = vocab2,
-#                                              id_options = Dict(:add_new => false))
-#     @test res_unk.char_ids[1] == vocab2.unk_id
-
-
-#     #large min_freq: all rarities - <unk>
-#     big_txt = "abcabcabcXYZ"     # X,Y,Z each frequency 1
-#     res_freq = preprocess_for_char_embeddings(big_txt;
-#                     vocab_options = Dict(:min_freq => 3))
-
-#     @test !haskey(res_freq.vocabulary.token2id, "X")
-#     @test res_freq.char_ids[end] == res_freq.vocabulary.unk_id
-# end
-
-
-# @testset "preprocess_for_char_embeddings - large corpus smoke-test" begin
-#     #build a approx 250 kB synthetic corpus and persist it
-    
-#     sentence   = "The quick brown fox jumps over the lazy dog. "
-#     big_text   = repeat(sentence, 5000)              #225 kB
-    
-#     path, io = mktemp()                              # path first, io second
-#     write(io, big_text)
-#     close(io)
-
-#     #run the preprocessing pipeline on that file
-    
-#     res = preprocess_for_char_embeddings(
-#               path;
-#               clean_options = Dict(:case_transform => :lower),
-#               vocab_options = Dict(:min_freq => 100)
-#           )
-
-    
-#     #integrity checks
-#     #cleaned text is non-empty and far shorter than raw only because
-#     # spaces were collapsed - not because the file vanished
-#     @test length(res.cleaned_text) > 100_000
-
-#     #consistent lengths: ids == chars
-#     @test length(res.char_ids) == length(res.chars)
-
-#     #vocabulary should contain more than just <unk>
-#     @test length(res.vocabulary.id2token) > 10
-
-#     #unseen glyph maps to <unk>
-#     unk_res = preprocess_for_char_embeddings("§"; vocab=res.vocabulary,
-#                                              id_options=Dict(:add_new=>false))
-#     @test unk_res.char_ids[1] == res.vocabulary.unk_id
-
-    
-#     #slice into windows and sanity-check
-#     function windowify(ids::Vector{Int}, win::Int, stride::Int)
-#         [ids[i:i+win-1] for i in 1:stride:length(ids)-win+1]
-#     end
-#     windows = windowify(res.char_ids, 128, 64)
-#     @test !isempty(windows)
-#     @test all(length(w) == 128 for w in windows)
-    
-#     #clean up temp-file
-#     rm(path; force=true)
-#     @test !isfile(path)
-# end
-
-
-# @testset "preprocess_for_char_embeddings - real text download" begin
-#     #download Alice's Adventures in Wonderland (150 kB)
-    
-#     url  = "https://www.gutenberg.org/cache/epub/11/pg11.txt"
-#     path = tempname() * ".txt"
-
-#     try
-#         Downloads.download(url, path)
-#     catch e
-#         @info "Network unavailable - skipping download test" exception = e
-#         return                          # skip the entire test-set
-#     end
-
-#     #preprocess the downloaded file
-#     out = preprocess_for_char_embeddings(
-#               path;
-#               clean_options = Dict(:case_transform => :lower),
-#               vocab_options = Dict(:min_freq => 10)      # keep common chars
-#           )
-
-    
-#     #logic checks
-#     @test length(out.cleaned_text) > 100_000
-#     @test length(out.chars) == length(Unicode.graphemes(out.cleaned_text))
-#     @test all(c in keys(out.vocabulary.token2id) for c in ["a","e","t"]) 
-
-#     res_unk = preprocess_for_char_embeddings("§";
-#                   vocab = out.vocabulary,
-#                   id_options = Dict(:add_new => false))
-#     @test res_unk.char_ids[1] == out.vocabulary.unk_id   # 3d
-
-#     #clean up
-#     rm(path; force = true)
-#     @test !isfile(path)
-# end
-
-
-# @testset "preprocess_for_char_embeddings - option matrix" begin
-#     #cleaning / whitespace / punctuation / accent flags
-#     raw = "Café   \t\n🚀!!  "    # accents, repeated blanks, emoji, punct
-
-#     clean_opts = Dict(
-#         :case_transform        => :lower,
-#         :do_remove_punctuation => true,
-#         :do_remove_accents     => true,
-#         :collapse_whitespace   => true,     # turn runs of blanks -> one space
-#     )
-#     char_opts  = Dict(:keep_space => false) # drop the single space we kept
-#     outA = preprocess_for_char_embeddings(raw;
-#                 from_file      = false,
-#                 clean_options  = clean_opts,
-#                 char_options   = char_opts)
-
-#     @test outA.cleaned_text == "cafe 🚀"          # accent stripped, blanks to 1, punct gone
-#     @test outA.chars == ["c","a","f","e","🚀"]    # no space token
-#     @test length(outA.char_ids) == 5
-
-#     #space-keeping + Unicode-normalisation left intact
-#     raw2 = "Fiancée " * "\u202F" * "Ωmega"        # NARROW NBSP between words
-#     outB = preprocess_for_char_embeddings(raw2;
-#                 clean_options = Dict(:unicode_normalize => true),  # NFC default
-#                 char_options  = Dict(:keep_space => true))
-
-#     @test " " in outB.chars                      # space token kept
-#     @test occursin("fiancée", lowercase(outB.cleaned_text))  # NFC preserved é
-
-#     #external vocabulary + add_new / update_counts flags
-#     # make a tiny vocab with <unk>, a, b
-#     tok2id = Dict("<unk>"=>1, "a"=>2, "b"=>3)
-#     id2tok = ["<unk>","a","b"]
-#     extvoc = TextSpace.Preprocessing.Vocabulary(tok2id, id2tok, Dict{Int,Int}(), 1)
-
-#     txtC = "abx"    # 'x' is OOV
-#     outC1 = preprocess_for_char_embeddings(txtC;
-#                  vocab       = extvoc,
-#                  char_options=Dict(:keep_space=>false),
-#                  id_options  = Dict(:add_new=>false, :update_counts=>false))
-
-#     @test outC1.char_ids == [2,3,1]          # x -> unk_id
-#     @test !haskey(extvoc.token2id, "x")      # vocab unchanged
-
-#     # same text, but now allow growth and counting
-#     outC2 = preprocess_for_char_embeddings(txtC;
-#                  vocab       = extvoc,
-#                  id_options  = Dict(:add_new=>true,  :update_counts=>true))
-
-#     @test extvoc.token2id["x"] == length(extvoc.id2token)  # new token inserted
-#     @test outC2.char_ids[end] == extvoc.token2id["x"]
-#     @test extvoc.counts[ extvoc.token2id["x"] ] == 1       # counts updated
-
-#     #file-path input (temp file) + ensure_unk! auto-repairs
-#     mktemp() do path, io
-#         write(io, "§")             # char not in extvoc
-#         close(io)
-
-#         broken = TextSpace.Preprocessing.Vocabulary(Dict("a"=>1), ["a"], Dict{Int,Int}(), 0)
-#         outD   = preprocess_for_char_embeddings(path;
-#                     from_file = true,
-#                     vocab     = broken,            # will create *new* vocab
-#                     id_options= Dict(:add_new=>false))
-
-#         #the pipeline returns a *new* repaired vocabulary
-#         @test outD.vocabulary !== broken
-#         @test outD.vocabulary.unk_id >= 1
-#         @test outD.char_ids[1] == outD.vocabulary.unk_id
-
-
-#         rm(path; force=true)
-#     end
-# end
-
-
-# @testset "preprocess_for_char_embeddings - full option sweep" begin
-#     #cleaning + whitespace + accent/punct/emoji removal
-#     raw = "Café   \t\n🚀!!  —  Ωmega🙂"
-
-#     clean_opts = Dict(
-#         :case_transform        => :lower,
-#         :do_remove_punctuation => true,
-#         :do_remove_symbols     => true,
-#         :do_remove_emojis      => true,
-#         :do_remove_accents     => true,
-#         :collapse_whitespace   => true,   # collapse runs - single space
-#     )
-#     char_opts = Dict(:keep_space => false)
-#     outA = preprocess_for_char_embeddings(raw;
-#                 clean_options = clean_opts,
-#                 char_options  = char_opts,
-#                 from_file     = false)
-
-#     @test occursin("cafe ωmega", outA.cleaned_text)          # accent stripped, lower-cased
-#     @test !occursin('🚀', outA.cleaned_text) && !occursin('🙂', outA.cleaned_text)
-#     @test !occursin('—', outA.cleaned_text)                  # em-dash removed by punctuation flag
-#     @test " " ∉ outA.chars                                   # because keep_space=false
-#     @test length(outA.char_ids) == length(outA.chars)
-
-#     #keep_space = true + NFC normalisation only
-#     raw2 = "Fiancée " * "\u202F" * "Ωmega"   # NARROW NBSP between words
-#     outB = preprocess_for_char_embeddings(raw2;
-#                 clean_options = Dict(:unicode_normalize => true, :case_transform=>:lower),
-#                 char_options  = Dict(:keep_space => true),
-#                 from_file     = false)
-
-#     @test " " in outB.chars
-#     @test occursin("fiancée ωmega", outB.cleaned_text)
-
-#     #external vocabulary + add_new / update_counts
-#     tok2id = Dict("<unk>"=>1, "a"=>2, "b"=>3)
-#     id2tok = ["<unk>","a","b"]
-#     extvoc = TextSpace.Preprocessing.Vocabulary(tok2id, id2tok, Dict{Int,Int}(), 1)
-
-#     # add_new=false  keeps OOV as unk
-#     r1 = preprocess_for_char_embeddings("abx";
-#             vocab      = extvoc,
-#             id_options = Dict(:add_new=>false, :update_counts=>false),
-#             char_options = Dict(:keep_space=>false))
-#     @test r1.char_ids == [2,3,1]
-#     @test !haskey(extvoc.token2id, "x")
-
-#     # add_new=true  extends vocab and updates counts
-#     r2 = preprocess_for_char_embeddings("abx";
-#             vocab      = extvoc,
-#             id_options = Dict(:add_new=>true, :update_counts=>true))
-#     new_id = extvoc.token2id["x"]
-#     @test r2.char_ids[end] == new_id
-#     @test extvoc.counts[new_id] == 1
-
-#     #file input + ensure_unk! auto-repair
-#     mktemp() do path, io
-#         write(io, "§"); close(io)
-
-#         broken = TextSpace.Preprocessing.Vocabulary(Dict("a"=>1), ["a"],
-#                                                     Dict{Int,Int}(), 0)   # unk_id = 0
-
-#         outD = preprocess_for_char_embeddings(path;
-#                     from_file = true,
-#                     vocab     = broken,
-#                     id_options = Dict(:add_new=>false))
-
-#         @test outD.vocabulary !== broken   # got a repaired copy
-#         @test outD.vocabulary.unk_id ≥ 1
-#         @test outD.char_ids[1] == outD.vocabulary.unk_id
-
-#         rm(path; force=true)
-#     end
-
-#     #min_freq filtering
-#     r5 = preprocess_for_char_embeddings("xxxyyZ";
-#             vocab_options = Dict(:min_freq=>2))
-#     @test !haskey(r5.vocabulary.token2id, "Z")
-#     @test r5.char_ids[end] == r5.vocabulary.unk_id
-# end
-
-
-# @testset "preprocess_for_char_embeddings - curated UTF-8 hammer" begin
-#     #paragraph: 8 sentences with new-lines, tabs, ZWSP, emoji,
-#     #  combining marks, bidi controls, ligatures, NBSP, narrow NBSP,
-#     #  and a zero-width joiner sequence
-#     zwsp   = "\u200B"                       # ZERO-WIDTH SPACE
-#     nbsp   = "\u00A0"                       # NBSP
-#     nnbsp  = "\u202F"                       # NARROW NBSP
-#     rle    = "\u202B"                       # RTL EMBEDDING
-#     pdf    = "\u202C"                       # POP DIR. FORMAT
-#     ligfi  = "ﬁ"
-#     combé  = "e\u0301"                      # e + COMBINING ACUTE
-#     famemo = "👨‍👩‍👧‍👦"                    # family emoji with ZWJ
-#     astro  = "👩🏽‍🚀"
-#     para = """
-#     Once upon a  time,\tthere were two cafés.$(nbsp)$(nbsp)
-#     They said: “$(ligfi)\u200Breﬂies?!  No way!”\n
-#     Meanwhile, 数学 is fun; $nnbsp but $(rle)مرحبا بالعالم$(pdf) was written backwards.
-#     Tabs,  spaces,\n\nnew-lines, and $zwsp zero-widths $zwsp hide! $astro went to the 🌖.
-#     $famemo danced in the night… $(combé)!
-#     """
-
-#     #conservative cleaning (NFC only) + keep spaces
-#     outA = preprocess_for_char_embeddings(para;
-#             clean_options = Dict(:unicode_normalize=>true),
-#             char_options  = Dict(:keep_space=>true),
-#             from_file     = false)
-
-#     @test occursin("café", outA.cleaned_text)      # accent preserved
-#     @test '🌖' in outA.cleaned_text
-#     @test '\n' ∉ outA.cleaned_text                 # normalize_whitespace default
-#     @test " " in outA.chars                        # spaces kept
-#     @test length(outA.chars) == length(Unicode.graphemes(outA.cleaned_text))
-
-#     #aggressive emoji + punctuation + accent removal, collapse whitespace, drop spaces
-#     outB = preprocess_for_char_embeddings(para;
-#             clean_options = Dict(
-#                 :do_remove_emojis      => true,
-#                 :do_remove_punctuation => true,
-#                 :do_remove_accents     => true,
-#                 :collapse_whitespace   => true,
-#                 :case_transform        => :lower),
-#             char_options  = Dict(:keep_space=>false),
-#             from_file     = false)
-
-#     @test !occursin('🌖', outB.cleaned_text) && !occursin('👨', outB.cleaned_text)
-#     @test !occursin("¡", outB.cleaned_text)        # punctuation gone
-#     @test !occursin("é", outB.cleaned_text)        # accent stripped
-#     @test !occursin(r"\s\s", outB.cleaned_text)    # no double blanks
-#     @test " " ∉ outB.chars                         # spaces dropped
-#     @test outB.cleaned_text == lowercase(outB.cleaned_text)
-
-#     #symbols removed but punctuation kept; case upper
-#     outC = preprocess_for_char_embeddings(para;
-#             clean_options = Dict(
-#                 :do_remove_symbols     => true,   # removes currency, math, emoji
-#                 :do_remove_emojis      => false,  # but we already stripped symbols
-#                 :case_transform        => :upper),
-#             char_options = Dict(:keep_space=>true),
-#             from_file    = false)
-
-#     @test occursin("CAFÉ", outC.cleaned_text)
-#     @test '🌖' ∉ outC.cleaned_text                 # symbol removed
-#     @test 'É' ∈ outC.cleaned_text                 # accent still there
-#     @test any(c -> isuppercase(c[1]), outC.chars) # uppercase present
-
-#     #high min_freq filters rare glyphs; ensure OOV-><unk>
-#     rare_opts = Dict(:min_freq => 10)
-#     rD  = preprocess_for_char_embeddings(para;
-#             vocab_options = rare_opts,
-#             char_options  = Dict(:keep_space=>false),
-#             from_file     = false)
-
-#     #rare glyph '🌖' should NOT be in the pruned vocabulary
-#     @test !haskey(rD.vocabulary.token2id, "🌖")
-
-#     #every occurrence in the corpus must therefore map to <unk>
-#     @test all(id == rD.vocabulary.unk_id
-#             for (tok,id) in zip(rD.chars, rD.char_ids) if tok == "🌖")
-
-# end
diff --git a/test/preprocessing/preprocessing_char_tests.jl b/test/preprocessing/preprocessing_char_tests.jl
index 5fdb74f..e5a7f9f 100644
--- a/test/preprocessing/preprocessing_char_tests.jl
+++ b/test/preprocessing/preprocessing_char_tests.jl
@@ -1,6 +1,3 @@
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CharProcessing.jl"))
-
-
 @testset "tokenize_char" begin
     txt = "Café 😊"
 
diff --git a/test/preprocessing/preprocessing_cleantext_tests.jl b/test/preprocessing/preprocessing_cleantext_tests.jl
index 600b4d5..57303e6 100644
--- a/test/preprocessing/preprocessing_cleantext_tests.jl
+++ b/test/preprocessing/preprocessing_cleantext_tests.jl
@@ -1,8 +1,3 @@
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CleanText.jl"))
-
-using Random, Unicode
-
-
 @testset "remove_punctuation" begin
     txt = "Hello, world! (yes) - test."
     @test remove_punctuation(txt) == "Hello world yes  test"
diff --git a/test/preprocessing/preprocessing_paragraph_tests.jl b/test/preprocessing/preprocessing_paragraph_tests.jl
index 4a9a6cd..6c405a7 100644
--- a/test/preprocessing/preprocessing_paragraph_tests.jl
+++ b/test/preprocessing/preprocessing_paragraph_tests.jl
@@ -1,6 +1,4 @@
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "ParagraphProcessing.jl"))
-
-const PP = TextSpace.Preprocessing
+const PP = TextSpace.Plumbing
 
 
 @testset "unwrap_lines" begin
diff --git a/test/preprocessing/preprocessing_sentence_tests.jl b/test/preprocessing/preprocessing_sentence_tests.jl
index 31eee33..d7ae907 100644
--- a/test/preprocessing/preprocessing_sentence_tests.jl
+++ b/test/preprocessing/preprocessing_sentence_tests.jl
@@ -1,6 +1,3 @@
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "SentenceProcessing.jl"))
-
-
 @testset "split_sentences" begin
     txt = "Dr. Smith went to Washington.  It was rainy!  Was it fun?  Yes."
     sents = split_sentences(txt)
diff --git a/test/preprocessing/preprocessing_subword_pipeline_tests.jl b/test/preprocessing/preprocessing_subword_pipeline_tests.jl
index 69927b4..3013b66 100644
--- a/test/preprocessing/preprocessing_subword_pipeline_tests.jl
+++ b/test/preprocessing/preprocessing_subword_pipeline_tests.jl
@@ -8,7 +8,3 @@
 
 
 
-
-
-
-
diff --git a/test/preprocessing/preprocessing_test_gateway.jl b/test/preprocessing/preprocessing_test_gateway.jl
deleted file mode 100644
index c0cb59f..0000000
--- a/test/preprocessing/preprocessing_test_gateway.jl
+++ /dev/null
@@ -1,38 +0,0 @@
-
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "Tokenization.jl"))
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CharProcessing.jl"))
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "TextNormalization.jl"))
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "SentenceProcessing.jl"))
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "ParagraphProcessing.jl"))
-
-
-
-#test paragraph processing
-include("preprocessing_paragraph_tests.jl")
-
-
-#test sentence processing
-include("preprocessing_sentence_tests.jl")
-
-
-#test char preprocessing
-include("preprocessing_char_tests.jl")
-
-
-#test the text tokenization
-include("preprocessing_tokenization_tests.jl")
-
-
-#test the textnormalization
-include("preprocessing_textnormalization_tests.jl")
-
-
-#test clean text
-include("preprocessing_cleantext_tests.jl")
-
-
-
-
-
-
-
diff --git a/test/preprocessing/preprocessing_textnormalization_tests.jl b/test/preprocessing/preprocessing_textnormalization_tests.jl
index cce8c81..517cd9f 100644
--- a/test/preprocessing/preprocessing_textnormalization_tests.jl
+++ b/test/preprocessing/preprocessing_textnormalization_tests.jl
@@ -1,6 +1,3 @@
-include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "TextNormalization.jl"))
-
-
 @testset "normalize_unicode" begin
     decomposed = "Cafe\u0301"                 # "Café" (e + COMBINING ACUTE)
     composed   = "Café"                       # NFC form
diff --git a/test/preprocessing/preprocessing_tokenization_tests.jl b/test/preprocessing/preprocessing_tokenization_tests.jl
index 0303547..87ddf59 100644
--- a/test/preprocessing/preprocessing_tokenization_tests.jl
+++ b/test/preprocessing/preprocessing_tokenization_tests.jl
@@ -1,5 +1,3 @@
-
-
 @testset "basic_tokenize" begin
     txt = "Hello,  World!\n"
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 11bcb62..b9efa1a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,25 +1,19 @@
-using TextSpace
 using Test
-
+using TextSpace
 using Random
+using Unicode
 using Downloads 
 
-# include(joinpath(@__DIR__, "..", "src", "preprocessing", "Vocabulary.jl"))
-# include(joinpath(@__DIR__, "..", "src", "preprocessing", "SubwordProcessing.jl"))
-
-# include("SubwordEmbeddings/subword_embeddings_test_gateway.jl")
-# include("WordEmbeddings/word_embeddings_test_gateway.jl")
-# include("CharacterEmbeddings/character_embeddings_test_gateway.jl")
-# include("preprocessing/preprocessing_test_gateway.jl")
-
-# include("util-tests/__init__.jl")
-include("pipeline/preprocessing_pipeline_tests.jl")
-
+@testset "TextSpace.jl Test Suite" begin
+    @testset "Plumbing" begin
+        include("preprocessing/__init__.jl")  # Loads all preprocessing tests
+    end
 
+    @testset "Pipelines" begin
+        include("pipeline/__init__.jl")  # Loads all pipeline tests
+    end
 
-@testset "basic root test" begin
-    # Test 1: Default behavior (no punctuation or emoji removal)
-    text1 = "Hello, World!"
-    #only lowercasing and whitespace normalization occur.
-    @test text1 == "Hello, World!"
-end
+    @testset "Basic Tests" begin
+        @test true  # Placeholder: only confirms the test harness itself runs
+    end
+end
\ No newline at end of file