diff --git a/Manifest.toml b/Manifest.toml index f2e5ec9..c990627 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,8 +1,8 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.3" +julia_version = "1.11.4" manifest_format = "2.0" -project_hash = "aa82c28e8908bfa177480b2eb90c73573e8c6c54" +project_hash = "8d86f861eb2145d57a00b480d416501d1e0998c6" [[deps.AbstractFFTs]] deps = ["LinearAlgebra"] @@ -17,9 +17,9 @@ weakdeps = ["ChainRulesCore", "Test"] [[deps.Accessors]] deps = ["CompositionsBase", "ConstructionBase", "Dates", "InverseFunctions", "MacroTools"] -git-tree-sha1 = "0ba8f4c1f06707985ffb4804fdad1bf97b233897" +git-tree-sha1 = "3b86719127f50670efe356bc11073d84b4ed7a5d" uuid = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" -version = "0.1.41" +version = "0.1.42" [deps.Accessors.extensions] AxisKeysExt = "AxisKeys" @@ -34,7 +34,6 @@ version = "0.1.41" AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - Requires = "ae029012-a4dd-5104-9daa-d747884805df" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -58,28 +57,41 @@ uuid = "66dad0bd-aa9a-41b7-9441-69ab47430ed8" version = "1.1.3" [[deps.ArgCheck]] -git-tree-sha1 = "680b3b8759bd4c54052ada14e52355ab69e07876" +git-tree-sha1 = "f9e9a66c9b7be1ad7372bbd9b062d9230c30c5ce" uuid = "dce04be8-c92d-5529-be00-80e4d2c0e197" -version = "2.4.0" +version = "2.5.0" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" +version = "1.1.2" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +version = "1.11.0" [[deps.Atomix]] deps = ["UnsafeAtomics"] -git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +git-tree-sha1 = "b5bb4dc6248fde467be2a863eb8452993e74d402" uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -version = "0.1.0" +version = "1.1.1" + + [deps.Atomix.extensions] + AtomixCUDAExt = "CUDA" + AtomixMetalExt = "Metal" + AtomixOpenCLExt = "OpenCL" + AtomixoneAPIExt = "oneAPI" + + [deps.Atomix.weakdeps] + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + Metal = "dde4c033-4e86-420c-a63e-0dd931031962" + OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" + oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [[deps.BangBang]] -deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"] -git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae" +deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra"] +git-tree-sha1 = "26f41e1df02c330c4fa1e98d4aa2168fdafc9b1f" uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" -version = "0.4.3" +version = "0.4.4" [deps.BangBang.extensions] BangBangChainRulesCoreExt = "ChainRulesCore" @@ -99,6 +111,7 @@ version = "0.4.3" [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +version = "1.11.0" [[deps.Baselet]] git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e" @@ -113,8 +126,6 @@ version = "0.1.9" [[deps.BytePairEncoding]] deps = ["Artifacts", "Base64", "DataStructures", "DoubleArrayTries", "LRUCache", "LazyArtifacts", "StructWalk", "TextEncodeBase", "Unicode"] git-tree-sha1 = "b8d2edaf190d01d6a1c30b80d1db2d866fbe7371" -repo-rev = "master" -repo-url = "https://github.com/chengchingwen/BytePairEncoding.jl.git" uuid = "a4280ba5-8788-555a-8ca8-4a8c3d966a71" version = "0.5.2" @@ -125,9 +136,9 @@ version = "0.5.0" [[deps.ChainRules]] deps = ["Adapt", "ChainRulesCore", "Compat", "Distributed", "GPUArraysCore", "IrrationalConstants", "LinearAlgebra", "Random", "RealDot", "SparseArrays", "SparseInverseSubset", "Statistics", "StructArrays", "SuiteSparse"] -git-tree-sha1 = "a975ae558af61a2a48720a6271661bf2621e0f4e" +git-tree-sha1 = "204e9b212da5cc7df632b58af8d49763383f47fa" uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2" -version = "1.72.3" +version = "1.72.4" [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra"] @@ -139,6 +150,12 @@ weakdeps = ["SparseArrays"] [deps.ChainRulesCore.extensions] ChainRulesCoreSparseArraysExt = "SparseArrays" +[[deps.CodeTracking]] +deps = ["InteractiveUtils", "UUIDs"] +git-tree-sha1 = "062c5e1a5bf6ada13db96a4ae4749a4c2234f521" +uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" +version = "1.3.9" + [[deps.CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] git-tree-sha1 = "962834c22b66e32aa10f7611c08c8ca4e20749a9" @@ -164,7 +181,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+0" +version = "1.1.1+0" [[deps.CompositionsBase]] git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad" @@ -227,6 +244,7 @@ version = "1.0.0" [[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" +version = "1.11.0" [[deps.DefineSingletons]] git-tree-sha1 = "0fba8b706d0178b4dc7fd44a96a92382c9065c2c" @@ -254,11 +272,12 @@ version = "1.15.1" [[deps.Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +version = "1.11.0" [[deps.DocStringExtensions]] -git-tree-sha1 = "e7b7e6f178525d17c720ab9c081e4ef04429f860" +git-tree-sha1 = "7442a5dfe1ebb773c29cc2962a8980f47221d76c" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.9.4" +version = "0.9.5" [[deps.DoubleArrayTries]] deps = ["OffsetArrays", "Preferences", "StringViews"] @@ -291,6 +310,7 @@ version = "0.1.1" [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" +version = "1.11.0" [[deps.FillArrays]] deps = ["LinearAlgebra"] @@ -309,10 +329,10 @@ version = "1.13.0" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[deps.Flux]] -deps = ["Adapt", "ChainRulesCore", "Compat", "Functors", "LinearAlgebra", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"] -git-tree-sha1 = "f3b4e96288e8bee94cd1e230c1a9387d3e03f788" +deps = ["Adapt", "ChainRulesCore", "Compat", "Functors", "LinearAlgebra", "MLDataDevices", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"] +git-tree-sha1 = "df520a0727f843576801a0294f5be1a94be28e23" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.14.21" +version = "0.14.25" [deps.Flux.extensions] FluxAMDGPUExt = "AMDGPU" @@ -321,14 +341,12 @@ version = "0.14.21" FluxEnzymeExt = "Enzyme" FluxMPIExt = "MPI" FluxMPINCCLExt = ["CUDA", "MPI", "NCCL"] - FluxMetalExt = "Metal" [deps.Flux.weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" - Metal = "dde4c033-4e86-420c-a63e-0dd931031962" NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" @@ -356,18 +374,19 @@ version = "0.4.12" [[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" +version = "1.11.0" [[deps.GPUArrays]] -deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "04661708f5301394a1f1be86a07a89e835900db6" +deps = ["Adapt", "GPUArraysCore", "KernelAbstractions", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "ScopedValues", "Serialization", "Statistics"] +git-tree-sha1 = "eea7b3a1964b4de269bb380462a9da604be7fcdb" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "10.2.3" +version = "11.2.2" [[deps.GPUArraysCore]] deps = ["Adapt"] -git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950" +git-tree-sha1 = "83cf05ab16a73219e5f6bd1bdfa9848fa24ac627" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" -version = "0.1.6" +version = "0.2.0" [[deps.HTML_Entities]] deps = ["StrTables"] @@ -400,6 +419,7 @@ version = "0.3.1" [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +version = "1.11.0" [[deps.InverseFunctions]] git-tree-sha1 = "a779299d77cd080bf77b97535acecd73e1c5e5cb" @@ -445,6 +465,12 @@ version = "1.14.3" [deps.JSON3.weakdeps] ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" +[[deps.JuliaInterpreter]] +deps = ["CodeTracking", "InteractiveUtils", "Random", "UUIDs"] +git-tree-sha1 = "6ac9e4acc417a5b534ace12690bc6973c25b862f" +uuid = "aa1ae85d-cabe-5617-a682-6adf51b2e16a" +version = "0.10.3" + [[deps.JuliaVariables]] deps = ["MLStyle", "NameResolution"] git-tree-sha1 = "49fb3cb53362ddadb4415e9b73926d6b40709e70" @@ -468,10 +494,10 @@ version = "0.9.34" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"] -git-tree-sha1 = "020abd49586480c1be84f57da0017b5d3db73f7c" +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Unicode"] +git-tree-sha1 = "5e8b243b2e4c86648dac82cf767ae1456000b92d" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "8.0.0" +version = "9.4.0" [deps.LLVM.extensions] BFloat16sExt = "BFloat16s" @@ -481,9 +507,9 @@ version = "8.0.0" [[deps.LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "c2636c264861edc6d305e6b4d528f09566d24c5e" +git-tree-sha1 = "f8022e2c8b5eef5f30e7fb2fe52c97cc5674db23" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.30+0" +version = "0.0.36+0" [[deps.LRUCache]] git-tree-sha1 = "5519b95a490ff5fe629c4a7aa3b3dfc9160498b3" @@ -503,38 +529,47 @@ version = "0.4.6" [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" +version = "1.11.0" [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.3" +version = "0.6.4" [[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "7.84.0+0" +version = "8.6.0+0" [[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +version = "1.11.0" + +[[deps.LibGit2_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" +version = "1.7.2+0" [[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.10.2+0" +version = "1.11.0+1" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +version = "1.11.0" [[deps.LinearAlgebra]] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +version = "1.11.0" [[deps.LogExpFunctions]] deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "a2d09619db4e765091ee5c6ffe8872849de0feea" +git-tree-sha1 = "13ca9e2586b89836fd20cccf56e57e2b9ae7f38f" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.28" +version = "0.3.29" [deps.LogExpFunctions.extensions] LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" @@ -548,6 +583,7 @@ version = "0.3.28" [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +version = "1.11.0" [[deps.LoggingExtras]] deps = ["Dates", "Logging"] @@ -555,12 +591,62 @@ git-tree-sha1 = "f02b56007b064fbfddb4c9cd60161b6dd0f40df3" uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" version = "1.1.0" +[[deps.LoweredCodeUtils]] +deps = ["JuliaInterpreter"] +git-tree-sha1 = "4ef1c538614e3ec30cb6383b9eb0326a5c3a9763" +uuid = "6f1432cf-f94c-5a45-995e-cdbf5db27b0b" +version = "3.3.0" + [[deps.MLCore]] deps = ["DataAPI", "SimpleTraits", "Tables"] git-tree-sha1 = "73907695f35bc7ffd9f11f6c4f2ee8c1302084be" uuid = "c2834f40-e789-41da-a90e-33b280584a8c" version = "1.0.0" +[[deps.MLDataDevices]] +deps = ["Adapt", "Compat", "Functors", "Preferences", "Random"] +git-tree-sha1 = "85b47bc5a8bf0c886286638585df3bec7c9f8269" +uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" +version = "1.5.3" + + [deps.MLDataDevices.extensions] + MLDataDevicesAMDGPUExt = "AMDGPU" + MLDataDevicesCUDAExt = "CUDA" + MLDataDevicesChainRulesCoreExt = "ChainRulesCore" + MLDataDevicesChainRulesExt = "ChainRules" + MLDataDevicesFillArraysExt = "FillArrays" + MLDataDevicesGPUArraysExt = "GPUArrays" + MLDataDevicesMLUtilsExt = "MLUtils" + MLDataDevicesMetalExt = ["GPUArrays", "Metal"] + MLDataDevicesOneHotArraysExt = "OneHotArrays" + MLDataDevicesReactantExt = "Reactant" + MLDataDevicesRecursiveArrayToolsExt = "RecursiveArrayTools" + MLDataDevicesReverseDiffExt = "ReverseDiff" + MLDataDevicesSparseArraysExt = "SparseArrays" + MLDataDevicesTrackerExt = "Tracker" + MLDataDevicesZygoteExt = "Zygote" + MLDataDevicescuDNNExt = ["CUDA", "cuDNN"] + MLDataDevicesoneAPIExt = ["GPUArrays", "oneAPI"] + + [deps.MLDataDevices.weakdeps] + AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2" + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" + GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" + MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" + Metal = "dde4c033-4e86-420c-a63e-0dd931031962" + OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" + Reactant = "3c362404-f566-11ee-1572-e11a4b42c853" + RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" + ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" + SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" + Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" + oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + [[deps.MLStyle]] git-tree-sha1 = "bc38dff0548128765760c79eb7388a4b37fae2c8" uuid = "d8e11817-5142-5d16-987a-aa16d5891078" @@ -580,6 +666,7 @@ version = "0.5.16" [[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +version = "1.11.0" [[deps.MbedTLS]] deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"] @@ -590,7 +677,7 @@ version = "1.1.9" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" +version = "2.28.6+0" [[deps.MicroCollections]] deps = ["Accessors", "BangBang", "InitialValues"] @@ -606,10 +693,11 @@ version = "1.2.0" [[deps.Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +version = "1.11.0" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2022.10.11" +version = "2023.12.12" [[deps.NNlib]] deps = ["Adapt", "Atomix", "ChainRulesCore", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "Random", "ScopedValues", "Statistics"] @@ -637,9 +725,9 @@ version = "0.9.30" [[deps.NaNMath]] deps = ["OpenLibm_jll"] -git-tree-sha1 = "030ea22804ef91648f29b7ad3fc15fa49d0e6e71" +git-tree-sha1 = "9b8215b1ee9e78a293f99797cd31375471b2bcae" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "1.0.3" +version = "1.1.3" [[deps.NameResolution]] deps = ["PrettyPrint"] @@ -662,25 +750,25 @@ weakdeps = ["Adapt"] [[deps.OneHotArrays]] deps = ["Adapt", "ChainRulesCore", "Compat", "GPUArraysCore", "LinearAlgebra", "NNlib"] -git-tree-sha1 = "3685584454b04cd52169c787ba4d196da8a04d10" +git-tree-sha1 = "bfe8e84c71972f77e775f75e6d8048ad3fdbe8bc" uuid = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" -version = "0.2.9" +version = "0.2.10" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.21+4" +version = "0.3.27+1" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" -version = "0.8.1+0" +version = "0.8.1+4" [[deps.OpenSSL]] deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] -git-tree-sha1 = "38cb508d080d21dc1128f7fb04f20387ed4c0af4" +git-tree-sha1 = "f1a7e086c677df53e064e0fdd2c9d0b0833e3f6e" uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" -version = "1.4.3" +version = "1.5.0" [[deps.OpenSSL_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -701,9 +789,9 @@ uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" version = "0.3.4" [[deps.OrderedCollections]] -git-tree-sha1 = "cc4054e898b852042d7b503313f7ad03de99c3dd" +git-tree-sha1 = "05868e21324cede2207c6f0f466b4bfef6d5e7ee" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.8.0" +version = "1.8.1" [[deps.Parsers]] deps = ["Dates", "PrecompileTools", "UUIDs"] @@ -718,9 +806,13 @@ uuid = "570af359-4316-4cb7-8c74-252c00c2016b" version = "1.2.1" [[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.2" +version = "1.11.0" +weakdeps = ["REPL"] + + [deps.Pkg.extensions] + REPLExt = "REPL" [[deps.PrecompileTools]] deps = ["Preferences"] @@ -748,6 +840,7 @@ version = "0.1.4" [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" +version = "1.11.0" [[deps.ProgressLogging]] deps = ["Logging", "SHA", "UUIDs"] @@ -761,12 +854,14 @@ uuid = "43287f4e-b6f4-7ad1-bb20-aadabca52c3d" version = "1.3.0" [[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +deps = ["InteractiveUtils", "Markdown", "Sockets", "StyledStrings", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" +version = "1.11.0" [[deps.Random]] -deps = ["SHA", "Serialization"] +deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +version = "1.11.0" [[deps.RealDot]] deps = ["LinearAlgebra"] @@ -791,6 +886,16 @@ git-tree-sha1 = "62389eeff14780bfe55195b7204c0d8738436d64" uuid = "ae029012-a4dd-5104-9daa-d747884805df" version = "1.3.1" +[[deps.Revise]] +deps = ["CodeTracking", "FileWatching", "JuliaInterpreter", "LibGit2", "LoweredCodeUtils", "OrderedCollections", "REPL", "Requires", "UUIDs", "Unicode"] +git-tree-sha1 = "f6f7d30fb0d61c64d0cfe56cf085a7c9e7d5bc80" +uuid = "295af30f-e4ad-537b-8983-00126c2a3abe" +version = "3.8.0" +weakdeps = ["Distributed"] + + [deps.Revise.extensions] + DistributedExt = "Distributed" + [[deps.RustRegex]] deps = ["rure_jll"] git-tree-sha1 = "16be5e710d7b980678ec0d8c61d4c00e9a5591e3" @@ -815,6 +920,7 @@ version = "1.2.1" [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +version = "1.11.0" [[deps.Setfield]] deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] @@ -840,6 +946,7 @@ version = "0.9.4" [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +version = "1.11.0" [[deps.SortingAlgorithms]] deps = ["DataStructures"] @@ -850,6 +957,7 @@ version = "1.2.1" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +version = "1.11.0" [[deps.SparseInverseSubset]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] @@ -890,21 +998,26 @@ uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" version = "1.4.3" [[deps.Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] +deps = ["LinearAlgebra"] +git-tree-sha1 = "ae3bb1eb3bba077cd276bc5cfc337cc65c3075c0" uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -version = "1.9.0" +version = "1.11.1" +weakdeps = ["SparseArrays"] + + [deps.Statistics.extensions] + SparseArraysExt = ["SparseArrays"] [[deps.StatsAPI]] deps = ["LinearAlgebra"] -git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed" +git-tree-sha1 = "9d72a13a3f4dd3795a195ac5a44d7d6ff5f552ff" uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -version = "1.7.0" +version = "1.7.1" [[deps.StatsBase]] deps = ["AliasTables", "DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "29321314c920c26684834965ec2ce0dacc9cf8e5" +git-tree-sha1 = "b81c5035922cc89c2d9523afc6c54be512411466" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.34.4" +version = "0.34.5" [[deps.StrTables]] deps = ["Dates"] @@ -919,14 +1032,15 @@ version = "1.3.4" [[deps.StructArrays]] deps = ["ConstructionBase", "DataAPI", "Tables"] -git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be" +git-tree-sha1 = "8ad2e38cbb812e29348719cc63580ec1dfeb9de4" uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" -version = "0.6.18" -weakdeps = ["Adapt", "GPUArraysCore", "SparseArrays", "StaticArrays"] +version = "0.7.1" +weakdeps = ["Adapt", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "SparseArrays", "StaticArrays"] [deps.StructArrays.extensions] StructArraysAdaptExt = "Adapt" - StructArraysGPUArraysCoreExt = "GPUArraysCore" + StructArraysGPUArraysCoreExt = ["GPUArraysCore", "KernelAbstractions"] + StructArraysLinearAlgebraExt = "LinearAlgebra" StructArraysSparseArraysExt = "SparseArrays" StructArraysStaticArraysExt = "StaticArrays" @@ -942,14 +1056,18 @@ git-tree-sha1 = "ef626534f40a9d99b3dafdbd54cfe411ad86e3b8" uuid = "31cdf514-beb7-4750-89db-dda9d2eb8d3d" version = "0.2.1" +[[deps.StyledStrings]] +uuid = "f489334b-da3d-4c2e-b8f0-e476e12c162b" +version = "1.11.0" + [[deps.SuiteSparse]] deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" [[deps.SuiteSparse_jll]] -deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "5.10.1+6" +version = "7.7.0+0" [[deps.TOML]] deps = ["Dates"] @@ -964,9 +1082,9 @@ version = "1.0.1" [[deps.Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "OrderedCollections", "TableTraits"] -git-tree-sha1 = "598cd7c1f68d1e205689b1c2fe65a9f85846f297" +git-tree-sha1 = "f2c1efbc8f3a609aadf318094f8fc5204bdaf344" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.12.0" +version = "1.12.1" [[deps.Tar]] deps = ["ArgTools", "SHA"] @@ -976,6 +1094,7 @@ version = "1.10.0" [[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +version = "1.11.0" [[deps.TextEncodeBase]] deps = ["DataStructures", "DoubleArrayTries", "FuncPipelines", "PartialFunctions", "PrimitiveOneHot", "RustRegex", "StaticArrays", "StructWalk", "Unicode", "WordTokenizers"] @@ -1018,14 +1137,20 @@ version = "1.5.2" [[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +version = "1.11.0" [[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +version = "1.11.0" [[deps.UnsafeAtomics]] -git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +git-tree-sha1 = "b13c4edda90890e5b04ba24e20a310fbe6f249ff" uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" -version = "0.2.1" +version = "0.3.0" +weakdeps = ["LLVM"] + + [deps.UnsafeAtomics.extensions] + UnsafeAtomicsLLVM = ["LLVM"] [[deps.WordTokenizers]] deps = ["DataDeps", "HTML_Entities", "StrTables", "Unicode"] @@ -1036,7 +1161,7 @@ version = "0.5.6" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" +version = "1.2.13+1" [[deps.Zygote]] deps = ["AbstractFFTs", "ChainRules", "ChainRulesCore", "DiffRules", "Distributed", "FillArrays", "ForwardDiff", "GPUArrays", "GPUArraysCore", "IRTools", "InteractiveUtils", "LinearAlgebra", "LogExpFunctions", "MacroTools", "NaNMath", "PrecompileTools", "Random", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "ZygoteRules"] @@ -1063,17 +1188,17 @@ version = "0.2.7" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+0" +version = "5.11.0+0" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.48.0+0" +version = "1.59.0+0" [[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" +version = "17.4.0+2" [[deps.rure_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] diff --git a/Project.toml b/Project.toml index 6807277..95270ef 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" @@ -23,6 +24,9 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] +BytePairEncoding = "0.5.2" +Revise = "3.8.0" +TextEncodeBase = "0.8.3" Unicode = ">=1.10.8, <1.12" julia = ">=1.9" diff --git a/src/TextSpace.jl b/src/TextSpace.jl index cdd3a91..145c730 100644 --- a/src/TextSpace.jl +++ b/src/TextSpace.jl @@ -29,10 +29,11 @@ resource(name) = joinpath(@__DIR__, "resources", name) # e.g. "gpt2_merges.txt" include("preprocessing/__init__.jl") include("utils/__init__.jl") + include(joinpath(@__DIR__, "pipeline", "Pipeline.jl")) # #now use +@reexport using .Plumbing @reexport using .Pipeline - # high-level embeddings # include(joinpath(@__DIR__, "embeddings", "CharacterEmbeddings.jl")) # @reexport using .CharacterEmbeddings diff --git a/src/preprocessing/CleanText.jl b/src/preprocessing/CleanText.jl index b79f054..1bd90fd 100644 --- a/src/preprocessing/CleanText.jl +++ b/src/preprocessing/CleanText.jl @@ -106,18 +106,12 @@ Strip all combining diacritical marks (Unicode category *Mn*) from `text` while leaving base characters intact. Works on Julia 1.6 - 1.11. """ function remove_accents(text::AbstractString)::String + # normalize to NFD to separate characters from their accents nfd = Unicode.normalize(text, :NFD) - - if isdefined(Unicode, :combining_class) # >= 1.10 fast path - io = IOBuffer() - @inbounds for c in nfd - Unicode.combining_class(c) == 0 && write(io, c) - end - return Unicode.normalize(String(take!(io)), :NFC) - else # 1.6 - 1.9 fallback - stripped = replace(nfd, r"\p{Mn}" => "") - return Unicode.normalize(stripped, :NFC) - end + # remove all combining diacritical marks using regex + stripped = replace(nfd, r"\p{Mn}" => "") + # normalize back to NFC for canonical representation + return Unicode.normalize(stripped, :NFC) end diff --git a/src/preprocessing/__init__.jl b/src/preprocessing/__init__.jl index 070e031..0787c1d 100644 --- a/src/preprocessing/__init__.jl +++ b/src/preprocessing/__init__.jl @@ -1,6 +1,5 @@ module Plumbing - include("CleanText.jl") include("TextNormalization.jl") include("Tokenization.jl") @@ -8,12 +7,14 @@ include("CharProcessing.jl") include("SentenceProcessing.jl") include("ParagraphProcessing.jl") - export clean_text, strip_zero_width, normalize_whitespace, - tokenize, tokenize_batch, - tokenize_char, - split_sentences, - split_paragraphs, - filter_paragraphs - -end + remove_punctuation, remove_emojis, remove_accents, + tokenize, tokenize_batch, unwrap_lines, + tokenize_char, char_tokens, + split_sentences, + split_paragraphs, + filter_paragraphs, normalize_unicode, paragraph_windows, + merge_short_paragraphs, _is_blank_paragraph, drop_empty_paragraph, + strip_outer_quotes, SlidingSentenceWindow, + basic_tokenize, strip_punctuation, ngrams, WHITESPACE_REGEX +end \ No newline at end of file diff --git a/test/pipeline/__init__.jl b/test/pipeline/__init__.jl new file mode 100644 index 0000000..ab568f9 --- /dev/null +++ b/test/pipeline/__init__.jl @@ -0,0 +1,2 @@ +# load all pipeline test files +include("preprocessing_pipeline_tests.jl") \ No newline at end of file diff --git a/test/preprocessing/__init__.jl b/test/preprocessing/__init__.jl new file mode 100644 index 0000000..b5d6c20 --- /dev/null +++ b/test/preprocessing/__init__.jl @@ -0,0 +1,8 @@ +# test/preprocessing/__init__.jl +include("preprocessing_cleantext_tests.jl") +include("preprocessing_textnormalization_tests.jl") +include("preprocessing_tokenization_tests.jl") +include("preprocessing_char_tests.jl") +include("preprocessing_sentence_tests.jl") +include("preprocessing_paragraph_tests.jl") +include("preprocessing_subword_pipeline_tests.jl") diff --git a/test/preprocessing/preprocessing_char_pipeline_tests.jl b/test/preprocessing/preprocessing_char_pipeline_tests.jl deleted file mode 100644 index 344921c..0000000 --- a/test/preprocessing/preprocessing_char_pipeline_tests.jl +++ /dev/null @@ -1,495 +0,0 @@ -# include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "Preprocessing.jl")) - - - -# @testset "preprocess_for_char_embeddings" begin - -# #raw string - new vocabulary - -# txt = "Hello 😊" -# out = preprocess_for_char_embeddings(txt) # build vocab - -# #cleaning left the visible text unchanged -# @test out.cleaned_text == txt - -# #tokenisation: default keeps case, drops spaces -# @test out.chars == ["H","e","l","l","o"," ","😊"] - -# # exists and has a valid positive id -# @test haskey(out.vocabulary.token2id, "") -# @test out.vocabulary.unk_id >= 1 - -# #char_ids are a 1-to-1 mapping of the returned characters -# @test out.char_ids == -# [out.vocabulary.token2id[c] for c in out.chars] - - -# #re-using an existing vocabulary -# txt2 = "Hola" -# out2 = preprocess_for_char_embeddings(txt2; vocab = out.vocabulary) - -# #should not create a brand-new Vocabulary object -# @test out2.vocabulary === out.vocabulary - -# #all ids are within the known range -# @test all(1 ≤ id ≤ length(out.vocabulary.id2token) for id in out2.char_ids) - - -# #file-path input & unknown characters fall back to -# mktemp() do path, io # ← path first, stream second -# write(io, "¿Qué?") -# close(io) - -# out3 = preprocess_for_char_embeddings(path; vocab = out.vocabulary) - -# # first character should be mapped to the unk id -# @test out3.char_ids[1] == out.vocabulary.unk_id -# end -# end - - -# @testset "preprocess_for_char_embeddings - additional coverage" begin - -# #clean_options + char_options flags -# txt = "Go :)" -# res = preprocess_for_char_embeddings(txt; -# clean_options = Dict( -# :case_transform => :lower, -# :do_remove_punctuation => true -# ), -# char_options = Dict(:keep_space => true)) - -# @test res.cleaned_text == "go" # no trailing blank after normalisation -# @test res.chars == ["g","o"] -# @test res.char_ids[1:2] == -# [res.vocabulary.token2id[c] for c in res.chars] # length 2 - - - -# # add_new = true really grows the vocabulary and updates counts -# base = preprocess_for_char_embeddings("abc") # fresh vocab -# orig_vocab = base.vocabulary -# orig_len = length(orig_vocab.id2token) - -# extra = preprocess_for_char_embeddings("abcx"; -# vocab = orig_vocab, -# id_options = Dict(:add_new => true)) - -# @test length(orig_vocab.id2token) == orig_len + 1 # x appended -# new_id = orig_vocab.token2id["x"] -# @test orig_vocab.counts[new_id] == 1 # counts updated -# @test extra.char_ids[end] == new_id - - - -# #min_freq filter in vocab_options -# txt_lowfreq = "aaab" # 'a' freq=3, 'b' freq=1 -# vfilt = preprocess_for_char_embeddings(txt_lowfreq; -# vocab_options = Dict(:min_freq => 2)) - -# @test haskey(vfilt.vocabulary.token2id, "a") # kept -# @test !haskey(vfilt.vocabulary.token2id, "b") # filtered out -# @test vfilt.char_ids[end] == vfilt.vocabulary.unk_id #'b' - - - -# #tmp-file cleanup guard -# tmp_path = "" -# mktemp() do path, io -# tmp_path = path -# write(io, "Test") -# close(io) -# preprocess_for_char_embeddings(path) # just exercise the call -# end -# @test !isfile(tmp_path) # mktemp has removed the file -# end - - -# @testset "preprocess_for_char_embeddings - edge cases" begin - -# #mixed accents + uppercase transform + accent-stripping -# txt_acc = "Áaá\n" -# out_acc = preprocess_for_char_embeddings(txt_acc; -# clean_options = Dict( -# :case_transform => :upper, -# :do_remove_accents => true, -# :unicode_normalize => true -# )) - -# expected_clean = "AAA" # three letters, newline removed -# expected_chars = ["A","A","A"] - -# @test out_acc.cleaned_text == expected_clean -# @test out_acc.chars == expected_chars -# @test out_acc.char_ids == [out_acc.vocabulary.token2id["A"] for _ in 1:3] - - - -# #user-supplied special tokens (deduplicated, kept up-front) - -# res_spec = preprocess_for_char_embeddings("xy"; -# vocab_options = Dict(:special_tokens => ["", ""])) - -# @test res_spec.vocabulary.id2token[1:2] == ["", ""] -# # no duplicates even if we supply again -# res_spec2 = preprocess_for_char_embeddings("xy"; -# vocab_options = Dict(:special_tokens => ["", ""])) -# @test res_spec2.vocabulary.id2token[1] == "" && -# length(res_spec2.vocabulary.id2token[1:2]) == 2 # some non- token occupies slot 2 - - - - -# #update_counts = false leaves counts unchanged - -# base2 = preprocess_for_char_embeddings("zzz") -# vocab2 = base2.vocabulary -# counts0 = deepcopy(vocab2.counts) - -# preprocess_for_char_embeddings("zzz"; -# vocab = vocab2, -# id_options = Dict(:update_counts => false)) - -# @test vocab2.counts == counts0 # nothing incremented - - - -# #unknown character mapped to when add_new = false -# res_unk = preprocess_for_char_embeddings("§"; vocab = vocab2, -# id_options = Dict(:add_new => false)) -# @test res_unk.char_ids[1] == vocab2.unk_id - - -# #large min_freq: all rarities - -# big_txt = "abcabcabcXYZ" # X,Y,Z each frequency 1 -# res_freq = preprocess_for_char_embeddings(big_txt; -# vocab_options = Dict(:min_freq => 3)) - -# @test !haskey(res_freq.vocabulary.token2id, "X") -# @test res_freq.char_ids[end] == res_freq.vocabulary.unk_id -# end - - -# @testset "preprocess_for_char_embeddings - large corpus smoke-test" begin -# #build a approx 250 kB synthetic corpus and persist it - -# sentence = "The quick brown fox jumps over the lazy dog. " -# big_text = repeat(sentence, 5000) #225 kB - -# path, io = mktemp() # path first, io second -# write(io, big_text) -# close(io) - -# #run the preprocessing pipeline on that file - -# res = preprocess_for_char_embeddings( -# path; -# clean_options = Dict(:case_transform => :lower), -# vocab_options = Dict(:min_freq => 100) -# ) - - -# #integrity checks -# #cleaned text is non-empty and far shorter than raw only because -# # spaces were collapsed - not because the file vanished -# @test length(res.cleaned_text) > 100_000 - -# #consistent lengths: ids == chars -# @test length(res.char_ids) == length(res.chars) - -# #vocabulary should contain more than just -# @test length(res.vocabulary.id2token) > 10 - -# #unseen glyph maps to -# unk_res = preprocess_for_char_embeddings("§"; vocab=res.vocabulary, -# id_options=Dict(:add_new=>false)) -# @test unk_res.char_ids[1] == res.vocabulary.unk_id - - -# #slice into windows and sanity-check -# function windowify(ids::Vector{Int}, win::Int, stride::Int) -# [ids[i:i+win-1] for i in 1:stride:length(ids)-win+1] -# end -# windows = windowify(res.char_ids, 128, 64) -# @test !isempty(windows) -# @test all(length(w) == 128 for w in windows) - -# #clean up temp-file -# rm(path; force=true) -# @test !isfile(path) -# end - - -# @testset "preprocess_for_char_embeddings - real text download" begin -# #download Alice's Adventures in Wonderland (150 kB) - -# url = "https://www.gutenberg.org/cache/epub/11/pg11.txt" -# path = tempname() * ".txt" - -# try -# Downloads.download(url, path) -# catch e -# @info "Network unavailable - skipping download test" exception = e -# return # skip the entire test-set -# end - -# #preprocess the downloaded file -# out = preprocess_for_char_embeddings( -# path; -# clean_options = Dict(:case_transform => :lower), -# vocab_options = Dict(:min_freq => 10) # keep common chars -# ) - - -# #logic checks -# @test length(out.cleaned_text) > 100_000 -# @test length(out.chars) == length(Unicode.graphemes(out.cleaned_text)) -# @test all(c in keys(out.vocabulary.token2id) for c in ["a","e","t"]) - -# res_unk = preprocess_for_char_embeddings("§"; -# vocab = out.vocabulary, -# id_options = Dict(:add_new => false)) -# @test res_unk.char_ids[1] == out.vocabulary.unk_id # 3d - -# #clean up -# rm(path; force = true) -# @test !isfile(path) -# end - - -# @testset "preprocess_for_char_embeddings - option matrix" begin -# #cleaning / whitespace / punctuation / accent flags -# raw = "Café \t\n🚀!! " # accents, repeated blanks, emoji, punct - -# clean_opts = Dict( -# :case_transform => :lower, -# :do_remove_punctuation => true, -# :do_remove_accents => true, -# :collapse_whitespace => true, # turn runs of blanks -> one space -# ) -# char_opts = Dict(:keep_space => false) # drop the single space we kept -# outA = preprocess_for_char_embeddings(raw; -# from_file = false, -# clean_options = clean_opts, -# char_options = char_opts) - -# @test outA.cleaned_text == "cafe 🚀" # accent stripped, blanks to 1, punct gone -# @test outA.chars == ["c","a","f","e","🚀"] # no space token -# @test length(outA.char_ids) == 5 - -# #space-keeping + Unicode-normalisation left intact -# raw2 = "Fiancée " * "\u202F" * "Ωmega" # NARROW NBSP between words -# outB = preprocess_for_char_embeddings(raw2; -# clean_options = Dict(:unicode_normalize => true), # NFC default -# char_options = Dict(:keep_space => true)) - -# @test " " in outB.chars # space token kept -# @test occursin("fiancée", lowercase(outB.cleaned_text)) # NFC preserved é - -# #external vocabulary + add_new / update_counts flags -# # make a tiny vocab with , a, b -# tok2id = Dict(""=>1, "a"=>2, "b"=>3) -# id2tok = ["","a","b"] -# extvoc = TextSpace.Preprocessing.Vocabulary(tok2id, id2tok, Dict{Int,Int}(), 1) - -# txtC = "abx" # 'x' is OOV -# outC1 = preprocess_for_char_embeddings(txtC; -# vocab = extvoc, -# char_options=Dict(:keep_space=>false), -# id_options = Dict(:add_new=>false, :update_counts=>false)) - -# @test outC1.char_ids == [2,3,1] # x -> unk_id -# @test !haskey(extvoc.token2id, "x") # vocab unchanged - -# # same text, but now allow growth and counting -# outC2 = preprocess_for_char_embeddings(txtC; -# vocab = extvoc, -# id_options = Dict(:add_new=>true, :update_counts=>true)) - -# @test extvoc.token2id["x"] == length(extvoc.id2token) # new token inserted -# @test outC2.char_ids[end] == extvoc.token2id["x"] -# @test extvoc.counts[ extvoc.token2id["x"] ] == 1 # counts updated - -# #file-path input (temp file) + ensure_unk! auto-repairs -# mktemp() do path, io -# write(io, "§") # char not in extvoc -# close(io) - -# broken = TextSpace.Preprocessing.Vocabulary(Dict("a"=>1), ["a"], Dict{Int,Int}(), 0) -# outD = preprocess_for_char_embeddings(path; -# from_file = true, -# vocab = broken, # will create *new* vocab -# id_options= Dict(:add_new=>false)) - -# #the pipeline returns a *new* repaired vocabulary -# @test outD.vocabulary !== broken -# @test outD.vocabulary.unk_id >= 1 -# @test outD.char_ids[1] == outD.vocabulary.unk_id - - -# rm(path; force=true) -# end -# end - - -# @testset "preprocess_for_char_embeddings - full option sweep" begin -# #cleaning + whitespace + accent/punct/emoji removal -# raw = "Café \t\n🚀!! — Ωmega🙂" - -# clean_opts = Dict( -# :case_transform => :lower, -# :do_remove_punctuation => true, -# :do_remove_symbols => true, -# :do_remove_emojis => true, -# :do_remove_accents => true, -# :collapse_whitespace => true, # collapse runs - single space -# ) -# char_opts = Dict(:keep_space => false) -# outA = preprocess_for_char_embeddings(raw; -# clean_options = clean_opts, -# char_options = char_opts, -# from_file = false) - -# @test occursin("cafe ωmega", outA.cleaned_text) # accent stripped, lower-cased -# @test !occursin('🚀', outA.cleaned_text) && !occursin('🙂', outA.cleaned_text) -# @test !occursin('—', outA.cleaned_text) # em-dash removed by punctuation flag -# @test " " ∉ outA.chars # because keep_space=false -# @test length(outA.char_ids) == length(outA.chars) - -# #keep_space = true + NFC normalisation only -# raw2 = "Fiancée " * "\u202F" * "Ωmega" # NARROW NBSP between words -# outB = preprocess_for_char_embeddings(raw2; -# clean_options = Dict(:unicode_normalize => true, :case_transform=>:lower), -# char_options = Dict(:keep_space => true), -# from_file = false) - -# @test " " in outB.chars -# @test occursin("fiancée ωmega", outB.cleaned_text) - -# #external vocabulary + add_new / update_counts -# tok2id = Dict(""=>1, "a"=>2, "b"=>3) -# id2tok = ["","a","b"] -# extvoc = TextSpace.Preprocessing.Vocabulary(tok2id, id2tok, Dict{Int,Int}(), 1) - -# # add_new=false keeps OOV as unk -# r1 = preprocess_for_char_embeddings("abx"; -# vocab = extvoc, -# id_options = Dict(:add_new=>false, :update_counts=>false), -# char_options = Dict(:keep_space=>false)) -# @test r1.char_ids == [2,3,1] -# @test !haskey(extvoc.token2id, "x") - -# # add_new=true extends vocab and updates counts -# r2 = preprocess_for_char_embeddings("abx"; -# vocab = extvoc, -# id_options = Dict(:add_new=>true, :update_counts=>true)) -# new_id = extvoc.token2id["x"] -# @test r2.char_ids[end] == new_id -# @test extvoc.counts[new_id] == 1 - -# #file input + ensure_unk! auto-repair -# mktemp() do path, io -# write(io, "§"); close(io) - -# broken = TextSpace.Preprocessing.Vocabulary(Dict("a"=>1), ["a"], -# Dict{Int,Int}(), 0) # unk_id = 0 - -# outD = preprocess_for_char_embeddings(path; -# from_file = true, -# vocab = broken, -# id_options = Dict(:add_new=>false)) - -# @test outD.vocabulary !== broken # got a repaired copy -# @test outD.vocabulary.unk_id ≥ 1 -# @test outD.char_ids[1] == outD.vocabulary.unk_id - -# rm(path; force=true) -# end - -# #min_freq filtering -# r5 = preprocess_for_char_embeddings("xxxyyZ"; -# vocab_options = Dict(:min_freq=>2)) -# @test !haskey(r5.vocabulary.token2id, "Z") -# @test r5.char_ids[end] == r5.vocabulary.unk_id -# end - - -# @testset "preprocess_for_char_embeddings - curated UTF-8 hammer" begin -# #paragraph: 8 sentences with new-lines, tabs, ZWSP, emoji, -# # combining marks, bidi controls, ligatures, NBSP, narrow NBSP, -# # and a zero-width joiner sequence -# zwsp = "\u200B" # ZERO-WIDTH SPACE -# nbsp = "\u00A0" # NBSP -# nnbsp = "\u202F" # NARROW NBSP -# rle = "\u202B" # RTL EMBEDDING -# pdf = "\u202C" # POP DIR. FORMAT -# ligfi = "fi" -# combé = "e\u0301" # e + COMBINING ACUTE -# famemo = "👨‍👩‍👧‍👦" # family emoji with ZWJ -# astro = "👩🏽‍🚀" -# para = """ -# Once upon a time,\tthere were two cafés.$(nbsp)$(nbsp) -# They said: “$(ligfi)\u200Breflies?! No way!”\n -# Meanwhile, 数学 is fun; $nnbsp but $(rle)مرحبا بالعالم$(pdf) was written backwards. -# Tabs, spaces,\n\nnew-lines, and $zwsp zero-widths $zwsp hide! $astro went to the 🌖. -# $famemo danced in the night… $(combé)! -# """ - -# #conservative cleaning (NFC only) + keep spaces -# outA = preprocess_for_char_embeddings(para; -# clean_options = Dict(:unicode_normalize=>true), -# char_options = Dict(:keep_space=>true), -# from_file = false) - -# @test occursin("café", outA.cleaned_text) # accent preserved -# @test '🌖' in outA.cleaned_text -# @test '\n' ∉ outA.cleaned_text # normalize_whitespace default -# @test " " in outA.chars # spaces kept -# @test length(outA.chars) == length(Unicode.graphemes(outA.cleaned_text)) - -# #aggressive emoji + punctuation + accent removal, collapse whitespace, drop spaces -# outB = preprocess_for_char_embeddings(para; -# clean_options = Dict( -# :do_remove_emojis => true, -# :do_remove_punctuation => true, -# :do_remove_accents => true, -# :collapse_whitespace => true, -# :case_transform => :lower), -# char_options = Dict(:keep_space=>false), -# from_file = false) - -# @test !occursin('🌖', outB.cleaned_text) && !occursin('👨', outB.cleaned_text) -# @test !occursin("¡", outB.cleaned_text) # punctuation gone -# @test !occursin("é", outB.cleaned_text) # accent stripped -# @test !occursin(r"\s\s", outB.cleaned_text) # no double blanks -# @test " " ∉ outB.chars # spaces dropped -# @test outB.cleaned_text == lowercase(outB.cleaned_text) - -# #symbols removed but punctuation kept; case upper -# outC = preprocess_for_char_embeddings(para; -# clean_options = Dict( -# :do_remove_symbols => true, # removes currency, math, emoji -# :do_remove_emojis => false, # but we already stripped symbols -# :case_transform => :upper), -# char_options = Dict(:keep_space=>true), -# from_file = false) - -# @test occursin("CAFÉ", outC.cleaned_text) -# @test '🌖' ∉ outC.cleaned_text # symbol removed -# @test 'É' ∈ outC.cleaned_text # accent still there -# @test any(c -> isuppercase(c[1]), outC.chars) # uppercase present - -# #high min_freq filters rare glyphs; ensure OOV-> -# rare_opts = Dict(:min_freq => 10) -# rD = preprocess_for_char_embeddings(para; -# vocab_options = rare_opts, -# char_options = Dict(:keep_space=>false), -# from_file = false) - -# #rare glyph '🌖' should NOT be in the pruned vocabulary -# @test !haskey(rD.vocabulary.token2id, "🌖") - -# #every occurrence in the corpus must therefore map to -# @test all(id == rD.vocabulary.unk_id -# for (tok,id) in zip(rD.chars, rD.char_ids) if tok == "🌖") - -# end diff --git a/test/preprocessing/preprocessing_char_tests.jl b/test/preprocessing/preprocessing_char_tests.jl index 5fdb74f..e5a7f9f 100644 --- a/test/preprocessing/preprocessing_char_tests.jl +++ b/test/preprocessing/preprocessing_char_tests.jl @@ -1,6 +1,3 @@ -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CharProcessing.jl")) - - @testset "tokenize_char" begin txt = "Café 😊" diff --git a/test/preprocessing/preprocessing_cleantext_tests.jl b/test/preprocessing/preprocessing_cleantext_tests.jl index 600b4d5..57303e6 100644 --- a/test/preprocessing/preprocessing_cleantext_tests.jl +++ b/test/preprocessing/preprocessing_cleantext_tests.jl @@ -1,8 +1,3 @@ -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CleanText.jl")) - -using Random, Unicode - - @testset "remove_punctuation" begin txt = "Hello, world! (yes) - test." @test remove_punctuation(txt) == "Hello world yes test" diff --git a/test/preprocessing/preprocessing_paragraph_tests.jl b/test/preprocessing/preprocessing_paragraph_tests.jl index 4a9a6cd..6c405a7 100644 --- a/test/preprocessing/preprocessing_paragraph_tests.jl +++ b/test/preprocessing/preprocessing_paragraph_tests.jl @@ -1,6 +1,4 @@ -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "ParagraphProcessing.jl")) - -const PP = TextSpace.Preprocessing +const PP = TextSpace.Plumbing @testset "unwrap_lines" begin diff --git a/test/preprocessing/preprocessing_sentence_tests.jl b/test/preprocessing/preprocessing_sentence_tests.jl index 31eee33..d7ae907 100644 --- a/test/preprocessing/preprocessing_sentence_tests.jl +++ b/test/preprocessing/preprocessing_sentence_tests.jl @@ -1,6 +1,3 @@ -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "SentenceProcessing.jl")) - - @testset "split_sentences" begin txt = "Dr. Smith went to Washington. It was rainy! Was it fun? Yes." sents = split_sentences(txt) diff --git a/test/preprocessing/preprocessing_subword_pipeline_tests.jl b/test/preprocessing/preprocessing_subword_pipeline_tests.jl index 69927b4..3013b66 100644 --- a/test/preprocessing/preprocessing_subword_pipeline_tests.jl +++ b/test/preprocessing/preprocessing_subword_pipeline_tests.jl @@ -8,7 +8,3 @@ - - - - diff --git a/test/preprocessing/preprocessing_test_gateway.jl b/test/preprocessing/preprocessing_test_gateway.jl deleted file mode 100644 index c0cb59f..0000000 --- a/test/preprocessing/preprocessing_test_gateway.jl +++ /dev/null @@ -1,38 +0,0 @@ - -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "Tokenization.jl")) -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "CharProcessing.jl")) -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "TextNormalization.jl")) -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "SentenceProcessing.jl")) -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "ParagraphProcessing.jl")) - - - -#test paragraph processing -include("preprocessing_paragraph_tests.jl") - - -#test sentence processing -include("preprocessing_sentence_tests.jl") - - -#test char preprocessing -include("preprocessing_char_tests.jl") - - -#test the text tokenization -include("preprocessing_tokenization_tests.jl") - - -#test the textnormalization -include("preprocessing_textnormalization_tests.jl") - - -#test clean text -include("preprocessing_cleantext_tests.jl") - - - - - - - diff --git a/test/preprocessing/preprocessing_textnormalization_tests.jl b/test/preprocessing/preprocessing_textnormalization_tests.jl index cce8c81..517cd9f 100644 --- a/test/preprocessing/preprocessing_textnormalization_tests.jl +++ b/test/preprocessing/preprocessing_textnormalization_tests.jl @@ -1,6 +1,3 @@ -include(joinpath(@__DIR__, "..", "..", "src", "preprocessing", "TextNormalization.jl")) - - @testset "normalize_unicode" begin decomposed = "Cafe\u0301" # "Café" (e + COMBINING ACUTE) composed = "Café" # NFC form diff --git a/test/preprocessing/preprocessing_tokenization_tests.jl b/test/preprocessing/preprocessing_tokenization_tests.jl index 0303547..87ddf59 100644 --- a/test/preprocessing/preprocessing_tokenization_tests.jl +++ b/test/preprocessing/preprocessing_tokenization_tests.jl @@ -1,5 +1,3 @@ - - @testset "basic_tokenize" begin txt = "Hello, World!\n" diff --git a/test/runtests.jl b/test/runtests.jl index 11bcb62..b9efa1a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,25 +1,19 @@ -using TextSpace using Test - +using TextSpace using Random +using Unicode using Downloads -# include(joinpath(@__DIR__, "..", "src", "preprocessing", "Vocabulary.jl")) -# include(joinpath(@__DIR__, "..", "src", "preprocessing", "SubwordProcessing.jl")) - -# include("SubwordEmbeddings/subword_embeddings_test_gateway.jl") -# include("WordEmbeddings/word_embeddings_test_gateway.jl") -# include("CharacterEmbeddings/character_embeddings_test_gateway.jl") -# include("preprocessing/preprocessing_test_gateway.jl") - -# include("util-tests/__init__.jl") -include("pipeline/preprocessing_pipeline_tests.jl") - +@testset "TextSpace.jl Test Suite" begin + @testset "Plumbing" begin + include("preprocessing/__init__.jl") # Loads all preprocessing tests + end + @testset "Pipelines" begin + include("pipeline/__init__.jl") # Now loads all pipeline tests uniformly + end -@testset "basic root test" begin - # Test 1: Default behavior (no punctuation or emoji removal) - text1 = "Hello, World!" - #only lowercasing and whitespace normalization occur. - @test text1 == "Hello, World!" -end + @testset "Basic Tests" begin + @test true # Your basic smoke tests + end +end \ No newline at end of file