Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
279 changes: 202 additions & 77 deletions Manifest.toml

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand All @@ -23,6 +24,9 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

# Version bounds for the dependencies declared in this project.
[compat]
BytePairEncoding = "0.5.2"
# NOTE(review): Revise is normally a development-time tool loaded from the user's
# own environment; confirm it is really needed as a dependency of this package.
Revise = "3.8.0"
TextEncodeBase = "0.8.3"
# NOTE(review): Unicode is a standard library — confirm this explicit range is intended.
Unicode = ">=1.10.8, <1.12"
# NOTE(review): open-ended lower bound with no upper limit — confirm this is intended.
julia = ">=1.9"

Expand Down
3 changes: 2 additions & 1 deletion src/TextSpace.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ resource(name) = joinpath(@__DIR__, "resources", name) # e.g. "gpt2_merges.txt"
# Load the preprocessing sources (this file defines the `Plumbing` submodule)
# and the utility sources.
include("preprocessing/__init__.jl")
include("utils/__init__.jl")


# Load the pipeline sources via an absolute path built from this file's directory.
# NOTE(review): assumes Pipeline.jl defines `module Pipeline`, as required by the
# `using .Pipeline` below — confirm.
include(joinpath(@__DIR__, "pipeline", "Pipeline.jl"))
# Bring the submodules into scope and re-export them from this module.
@reexport using .Plumbing
@reexport using .Pipeline

# high-level embeddings
# include(joinpath(@__DIR__, "embeddings", "CharacterEmbeddings.jl"))
# @reexport using .CharacterEmbeddings
Expand Down
16 changes: 5 additions & 11 deletions src/preprocessing/CleanText.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,12 @@ Strip all combining diacritical marks (Unicode category *Mn*) from `text`
while leaving base characters intact. Works on Julia 1.6 - 1.11.
"""
function remove_accents(text::AbstractString)::String
    # Decompose first (NFD) so that a precomposed character such as 'é' becomes a
    # base letter followed by a separate combining mark.
    decomposed = Unicode.normalize(text, :NFD)

    # Drop every nonspacing mark (general category Mn). A single regex path is
    # used unconditionally. The former `combining_class(c) == 0` branch selected
    # a different character set than Mn, so the result depended on whether
    # `Unicode.combining_class` happened to be defined. It also left the
    # statements after its exhaustive if/else unreachable.
    stripped = replace(decomposed, r"\p{Mn}" => "")

    # Recompose (NFC) so the remaining characters are in canonical composed form.
    return Unicode.normalize(stripped, :NFC)
end


Expand Down
19 changes: 10 additions & 9 deletions src/preprocessing/__init__.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# Preprocessing submodule: pulls in the text-cleaning, normalization,
# tokenization and character/sentence/paragraph processing sources and exports
# their public names.
module Plumbing

# Each file is evaluated inside this module, so its definitions become
# members of `Plumbing`. Paths are resolved relative to this file.
include("CleanText.jl")
include("TextNormalization.jl")
include("Tokenization.jl")
include("CharProcessing.jl")
include("SentenceProcessing.jl")
include("ParagraphProcessing.jl")

# One export statement inside the module body. A stray `end` previously closed
# the module in the middle of this list, leaving the remaining names and a
# second `end` dangling outside it.
# NOTE(review): `_is_blank_paragraph` is underscore-prefixed yet exported;
# presumably for tests — confirm it should be public.
export clean_text, strip_zero_width, normalize_whitespace,
    remove_punctuation, remove_emojis, remove_accents,
    tokenize, tokenize_batch, unwrap_lines,
    tokenize_char, char_tokens,
    split_sentences,
    split_paragraphs,
    filter_paragraphs, normalize_unicode, paragraph_windows,
    merge_short_paragraphs, _is_blank_paragraph, drop_empty_paragraph,
    strip_outer_quotes, SlidingSentenceWindow,
    basic_tokenize, strip_punctuation, ngrams, WHITESPACE_REGEX

end # module Plumbing
2 changes: 2 additions & 0 deletions test/pipeline/__init__.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Load all pipeline test files.
# The path is resolved relative to this file's directory.
include("preprocessing_pipeline_tests.jl")
8 changes: 8 additions & 0 deletions test/preprocessing/__init__.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# test/preprocessing/__init__.jl
# Load every preprocessing test file, in the order listed.
# Paths are resolved relative to this file's directory.
include("preprocessing_cleantext_tests.jl")
include("preprocessing_textnormalization_tests.jl")
include("preprocessing_tokenization_tests.jl")
include("preprocessing_char_tests.jl")
include("preprocessing_sentence_tests.jl")
include("preprocessing_paragraph_tests.jl")
include("preprocessing_subword_pipeline_tests.jl")
Loading