diff --git a/README.md b/README.md
index b4e0eaa..df722e5 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![CI](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml/badge.svg)](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml)
 [![License: LGPL](https://img.shields.io/badge/License-LGPL_2.1-blue.svg)](https://opensource.org/licenses/LGPL-2.1)
 
-A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), and k-Nearest Neighbors (kNN) algorithms.
+A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), k-Nearest Neighbors (kNN), and TF-IDF algorithms.
 
 **[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)**
@@ -14,6 +14,7 @@ A Ruby library for text classification using Bayesian, LSI (Latent Semantic Inde
 - [Bayesian Classifier](#bayesian-classifier)
 - [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing)
 - [k-Nearest Neighbors (kNN)](#k-nearest-neighbors-knn)
+- [TF-IDF Vectorizer](#tf-idf-vectorizer)
 - [Persistence](#persistence)
 - [Performance](#performance)
 - [Development](#development)
@@ -256,6 +257,77 @@ knn.categories
 
 **Why the size difference?** Bayes stores aggregate statistics—adding 10,000 documents just increments counters. kNN stores every example and compares against all of them during classification, so performance degrades with size.
 
+## TF-IDF Vectorizer
+
+Transform text documents into TF-IDF (Term Frequency-Inverse Document Frequency) weighted feature vectors. TF-IDF downweights common words and upweights discriminative terms—the foundation for most classic text classification approaches.
+
+### Quick Start
+
+```ruby
+require 'classifier'
+
+tfidf = Classifier::TFIDF.new
+tfidf.fit(["Dogs are great pets", "Cats are independent", "Birds can fly"])
+
+# Transform text to a TF-IDF vector (L2 normalized).
+# Terms that were not seen during fit are ignored.
+vector = tfidf.transform("Dogs are pets")
+# => {:dog=>0.7071..., :pet=>0.7071...}
+
+# Fit and transform in one step
+vectors = tfidf.fit_transform(documents)
+```
+
+### Options
+
+```ruby
+tfidf = Classifier::TFIDF.new(
+  min_df: 2,            # Minimum document frequency (Integer count or Float proportion 0.0-1.0)
+  max_df: 0.95,         # Maximum document frequency (filters very common terms)
+  ngram_range: [1, 2],  # Extract unigrams and bigrams
+  sublinear_tf: true    # Use 1 + log(tf) instead of raw term frequency
+)
+```
+
+### Vocabulary Inspection
+
+```ruby
+tfidf.fit(documents) # e.g. the three Quick Start documents
+
+tfidf.vocabulary    # => {:dog=>0, :great=>1, :pet=>2, ...}
+tfidf.idf           # => {:dog=>1.693, :cat=>1.693, ...}
+tfidf.feature_names # => [:dog, :great, :pet, ...]
+tfidf.num_documents # => 3
+tfidf.fitted?       # => true
+```
+
+### N-gram Support
+
+```ruby
+# Extract bigrams only
+tfidf = Classifier::TFIDF.new(ngram_range: [2, 2])
+tfidf.fit(["quick brown fox", "lazy brown dog"])
+tfidf.vocabulary.keys
+# => [:quick_brown, :brown_fox, :lazi_brown, :brown_dog]
+
+# Unigrams through trigrams
+tfidf = Classifier::TFIDF.new(ngram_range: [1, 3])
+```
+
+### Serialization
+
+```ruby
+# Save to JSON
+json = tfidf.to_json
+File.write("tfidf.json", json)
+
+# Load from JSON
+loaded = Classifier::TFIDF.from_json(File.read("tfidf.json"))
+
+# Or use Marshal
+data = Marshal.dump(tfidf)
+loaded = Marshal.load(data)
+```
+
 ## Persistence
 
 Save and load classifiers with pluggable storage backends. Works with Bayes, LSI, and kNN classifiers.
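The 0.7071 and 1.693 values in the README examples above can be checked by hand. A minimal sketch in plain Ruby, independent of the library, assuming only the smoothed IDF formula `log((N + 1) / (df + 1)) + 1` and the L2 normalization used in `lib/classifier/tfidf.rb` below:

```ruby
# Quick Start corpus: 3 documents; "dog" and "pet" each appear in exactly 1.
n_docs = 3
idf = Math.log((n_docs + 1.0) / (1 + 1)) + 1
# => 1.6931... (the value reported by tfidf.idf[:dog])

# "Dogs are pets" yields two known terms, each with tf = 1 and the same IDF,
# so after L2 normalization each weight is 1 / sqrt(2).
weights = { dog: 1 * idf, pet: 1 * idf }
norm = Math.sqrt(weights.values.sum { |v| v * v })
weights.transform_values { |v| v / norm }
# => {:dog=>0.7071..., :pet=>0.7071...}
```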
diff --git a/lib/classifier.rb b/lib/classifier.rb
index 81c9c90..1128590 100644
--- a/lib/classifier.rb
+++ b/lib/classifier.rb
@@ -32,3 +32,4 @@
 require 'classifier/bayes'
 require 'classifier/lsi'
 require 'classifier/knn'
+require 'classifier/tfidf'
diff --git a/lib/classifier/errors.rb b/lib/classifier/errors.rb
index e94d338..89fd6b0 100644
--- a/lib/classifier/errors.rb
+++ b/lib/classifier/errors.rb
@@ -13,4 +13,7 @@
   class UnsavedChangesError < Error; end
 
   # Raised when a storage operation fails
   class StorageError < Error; end
+
+  # Raised when using an unfitted model
+  class NotFittedError < Error; end
 end
diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
new file mode 100644
index 0000000..f698476
--- /dev/null
+++ b/lib/classifier/tfidf.rb
@@ -0,0 +1,252 @@
+# rbs_inline: enabled
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2024 Lucas Carlson
+# License:: LGPL
+
+require 'json'
+
+module Classifier
+  # TF-IDF vectorizer: transforms text to weighted feature vectors.
+  # Downweights common words, upweights discriminative terms.
+  #
+  # Example:
+  #   tfidf = Classifier::TFIDF.new
+  #   tfidf.fit(["Dogs are great pets", "Cats are independent"])
+  #   tfidf.transform("Dogs are pets") # => {:dog=>0.7071..., :pet=>0.7071...}
+  #
+  class TFIDF
+    # @rbs @min_df: Integer | Float
+    # @rbs @max_df: Integer | Float
+    # @rbs @ngram_range: Array[Integer]
+    # @rbs @sublinear_tf: bool
+    # @rbs @vocabulary: Hash[Symbol, Integer]
+    # @rbs @idf: Hash[Symbol, Float]
+    # @rbs @num_documents: Integer
+    # @rbs @fitted: bool
+
+    attr_reader :vocabulary, :idf, :num_documents
+
+    # Creates a new TF-IDF vectorizer.
+    # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
+    # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
+    # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    #
+    # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
+    #      ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+      validate_df!(min_df, 'min_df')
+      validate_df!(max_df, 'max_df')
+      validate_ngram_range!(ngram_range)
+
+      @min_df = min_df
+      @max_df = max_df
+      @ngram_range = ngram_range
+      @sublinear_tf = sublinear_tf
+      @vocabulary = {}
+      @idf = {}
+      @num_documents = 0
+      @fitted = false
+    end
+
+    # Learns vocabulary and IDF weights from the corpus.
+    # @rbs (Array[String]) -> self
+    def fit(documents)
+      raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array)
+      raise ArgumentError, 'documents cannot be empty' if documents.empty?
+
+      @num_documents = documents.size
+      document_frequencies = Hash.new(0)
+
+      documents.each do |doc|
+        terms = extract_terms(doc)
+        terms.each_key { |term| document_frequencies[term] += 1 }
+      end
+
+      @vocabulary = {}
+      @idf = {}
+      vocab_index = 0
+
+      document_frequencies.each do |term, df|
+        next unless within_df_bounds?(df, @num_documents)
+
+        @vocabulary[term] = vocab_index
+        vocab_index += 1
+
+        # Smoothed IDF: log((N + 1) / (df + 1)) + 1
+        @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
+      end
+
+      @fitted = true
+      self
+    end
+
+    # Transforms a document into a normalized TF-IDF vector.
+    # @rbs (String) -> Hash[Symbol, Float]
+    def transform(document)
+      raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
+
+      terms = extract_terms(document)
+      result = {} #: Hash[Symbol, Float]
+
+      terms.each do |term, tf|
+        next unless @vocabulary.key?(term)
+
+        tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
+        result[term] = (tf_value * @idf[term]).to_f
+      end
+
+      normalize_vector(result)
+    end
+
+    # Fits and transforms in one step.
+    # @rbs (Array[String]) -> Array[Hash[Symbol, Float]]
+    def fit_transform(documents)
+      fit(documents)
+      documents.map { |doc| transform(doc) }
+    end
+
+    # Returns vocabulary terms in index order.
+    # @rbs () -> Array[Symbol]
+    def feature_names
+      @vocabulary.keys.sort_by { |term| @vocabulary[term] }
+    end
+
+    # @rbs () -> bool
+    def fitted?
+      @fitted
+    end
+
+    # @rbs (?untyped) -> Hash[Symbol, untyped]
+    def as_json(_options = nil)
+      {
+        version: 1,
+        type: 'tfidf',
+        min_df: @min_df,
+        max_df: @max_df,
+        ngram_range: @ngram_range,
+        sublinear_tf: @sublinear_tf,
+        vocabulary: @vocabulary,
+        idf: @idf,
+        num_documents: @num_documents,
+        fitted: @fitted
+      }
+    end
+
+    # @rbs (?untyped) -> String
+    def to_json(_options = nil)
+      JSON.generate(as_json)
+    end
+
+    # Loads a vectorizer from JSON.
+    # @rbs (String | Hash[String, untyped]) -> TFIDF
+    def self.from_json(json)
+      data = json.is_a?(String) ? JSON.parse(json) : json
+      raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf'
+
+      instance = new(
+        min_df: data['min_df'],
+        max_df: data['max_df'],
+        ngram_range: data['ngram_range'],
+        sublinear_tf: data['sublinear_tf']
+      )
+
+      instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
+      instance.instance_variable_set(:@idf, symbolize_keys(data['idf']))
+      instance.instance_variable_set(:@num_documents, data['num_documents'])
+      instance.instance_variable_set(:@fitted, data['fitted'])
+
+      instance
+    end
+
+    # @rbs () -> Array[untyped]
+    def marshal_dump
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+    end
+
+    # @rbs (Array[untyped]) -> void
+    def marshal_load(data)
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+    end
+
+    private
+
+    # @rbs (String) -> Hash[Symbol, Integer]
+    def extract_terms(document)
+      result = Hash.new(0)
+
+      if @ngram_range[0] <= 1
+        word_hash = document.clean_word_hash
+        word_hash.each { |term, count| result[term] += count }
+      end
+
+      return result if @ngram_range[1] <= 1
+
+      tokens = tokenize_for_ngrams(document)
+      (2..@ngram_range[1]).each do |n|
+        next if n < @ngram_range[0]
+
+        generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
+      end
+
+      result
+    end
+
+    # @rbs (String) -> Array[String]
+    def tokenize_for_ngrams(document)
+      document
+        .gsub(/[^\w\s]/, '')
+        .split
+        .map(&:downcase)
+        .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) }
+        .map(&:stem)
+    end
+
+    # @rbs (Array[String], Integer) -> Array[Symbol]
+    def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
+      return [] if tokens.size < n
+
+      tokens.each_cons(n).map { |gram| gram.join('_').intern }
+    end
+
+    # @rbs (Integer, Integer) -> bool
+    def within_df_bounds?(doc_freq, num_docs)
+      doc_freq.between?(
+        @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df,
+        @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+      )
+    end
+
+    # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]
+    def normalize_vector(vector)
+      return vector if vector.empty?
+
+      magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+      return vector if magnitude.zero?
+
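+      # Dividing by the Euclidean norm gives a unit-length vector, so dot
+      # products between transformed documents behave as cosine similarities.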
+      vector.transform_values { |v| v / magnitude }
+    end
+
+    # @rbs (Integer | Float, String) -> void
+    def validate_df!(value, name)
+      raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer)
+      raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0)
+      raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative?
+    end
+
+    # @rbs (Array[Integer]) -> void
+    def validate_ngram_range!(range)
+      raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
+      unless range.all?(Integer) && range.all?(&:positive?)
+        raise ArgumentError, 'ngram_range values must be positive integers'
+      end
+      raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
+    end
+
+    # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped]
+    def self.symbolize_keys(hash)
+      hash.transform_keys(&:to_sym)
+    end
+    private_class_method :symbolize_keys
+  end
+end
diff --git a/test/tfidf/tfidf_test.rb b/test/tfidf/tfidf_test.rb
new file mode 100644
index 0000000..fde00e2
--- /dev/null
+++ b/test/tfidf/tfidf_test.rb
@@ -0,0 +1,439 @@
+require_relative '../test_helper'
+
+class TFIDFTest < Minitest::Test
+  def setup
+    @doc1 = 'Dogs are great pets and very loyal'
+    @doc2 = 'Cats are independent and self-sufficient'
+    @doc3 = 'Birds can fly and sing beautiful songs'
+    @doc4 = 'Dogs and cats are popular pets'
+    @corpus = [@doc1, @doc2, @doc3, @doc4]
+  end
+
+  # Initialization tests
+
+  def test_default_initialization
+    tfidf = Classifier::TFIDF.new
+
+    refute_predicate tfidf, :fitted?
+    assert_empty tfidf.vocabulary
+    assert_empty tfidf.idf
+    assert_equal 0, tfidf.num_documents
+  end
+
+  def test_custom_min_df_integer
+    tfidf = Classifier::TFIDF.new(min_df: 2)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in only 1 document should be excluded
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
+      assert_operator doc_count, :>=, 2, "Term #{term} should appear in at least 2 documents"
+    end
+  end
+
+  def test_custom_min_df_float
+    tfidf = Classifier::TFIDF.new(min_df: 0.5)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in less than 50% of documents should be excluded
+    min_count = (@corpus.size * 0.5).ceil
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
+      assert_operator doc_count, :>=, min_count
+    end
+  end
+
+  def test_custom_max_df_integer
+    tfidf = Classifier::TFIDF.new(max_df: 2)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in more than 2 documents should be excluded
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
+      assert_operator doc_count, :<=, 2
+    end
+  end
+
+  def test_custom_max_df_float
+    tfidf = Classifier::TFIDF.new(max_df: 0.5)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in more than 50% of documents should be excluded
+    max_count = (@corpus.size * 0.5).floor
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
+      assert_operator doc_count, :<=, max_count
+    end
+  end
+
+  def test_invalid_min_df_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: -1) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 1.5) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 'invalid') }
+  end
+
+  def test_invalid_max_df_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: -1) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 1.5) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 'invalid') }
+  end
+
+  def test_invalid_ngram_range_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [2, 1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [0, 1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: 'invalid') }
+  end
+
+  # Fit tests
+
+  def test_fit_builds_vocabulary
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(@corpus)
+
+    assert_predicate tfidf, :fitted?
+    refute_empty tfidf.vocabulary
+    assert_equal @corpus.size, tfidf.num_documents
+  end
+
+  def test_fit_computes_idf
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(@corpus)
+
+    refute_empty tfidf.idf
+    assert_equal tfidf.vocabulary.size, tfidf.idf.size
+
+    # All IDF values should be positive
+    tfidf.idf.each_value do |idf_value|
+      assert_operator idf_value, :>, 0
+    end
+  end
+
+  def test_fit_idf_ordering
+    # Terms appearing in fewer documents should have higher IDF
+    docs = [
+      'apple banana cherry',
+      'apple banana date',
+      'apple elderberry fig'
+    ]
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(docs)
+
+    # 'appl' appears in all 3 docs, 'banana' in 2, others in 1
+    # IDF should be: rare terms > common terms
+    assert_operator tfidf.idf[:elderberri], :>, tfidf.idf[:banana]
+    assert_operator tfidf.idf[:banana], :>, tfidf.idf[:appl]
+  end
+
+  def test_fit_returns_self
+    tfidf = Classifier::TFIDF.new
+
+    result = tfidf.fit(@corpus)
+
+    assert_same tfidf, result
+  end
+
+  def test_fit_with_empty_array_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(ArgumentError) { tfidf.fit([]) }
+  end
+
+  def test_fit_with_non_array_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(ArgumentError) { tfidf.fit('not an array') }
+  end
+
+  # Transform tests
+
+  def test_transform_returns_tfidf_vector
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    vector = tfidf.transform('Dogs are loyal pets')
+
+    assert_instance_of Hash, vector
+    refute_empty vector
+    vector.each_value { |v| assert_kind_of Float, v }
+  end
+
+  def test_transform_before_fit_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(Classifier::NotFittedError) { tfidf.transform('Some text') }
+  end
+
+  def test_transform_normalizes_vector
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    vector = tfidf.transform('Dogs are loyal pets')
+
+    # L2 norm should be 1 (or close to it due to floating point)
+    magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+
+    assert_in_delta 1.0, magnitude, 0.0001
+  end
+
+  def test_transform_unknown_terms_ignored
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['apple banana', 'cherry date'])
+
+    # 'xyz' is not in vocabulary
+    vector = tfidf.transform('apple xyz')
+
+    refute vector.key?(:xyz)
+    assert vector.key?(:appl)
+  end
+
+  def test_transform_empty_result_for_unknown_text
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['apple banana', 'cherry date'])
+
+    vector = tfidf.transform('xyz uvw')
+
+    assert_empty vector
+  end
+
+  # fit_transform tests
+
+  def test_fit_transform
+    tfidf = Classifier::TFIDF.new
+
+    vectors = tfidf.fit_transform(@corpus)
+
+    assert_predicate tfidf, :fitted?
+    assert_equal @corpus.size, vectors.size
+    vectors.each { |v| assert_instance_of Hash, v }
+  end
+
+  # Sublinear TF tests
+
+  def test_sublinear_tf
+    # Create document with repeated term
+    doc_with_repeats = 'dog dog dog dog cat'
+    corpus = [doc_with_repeats, 'bird fish']
+
+    tfidf_linear = Classifier::TFIDF.new(sublinear_tf: false)
+    tfidf_sublinear = Classifier::TFIDF.new(sublinear_tf: true)
+
+    tfidf_linear.fit(corpus)
+    tfidf_sublinear.fit(corpus)
+
+    vec_linear = tfidf_linear.transform(doc_with_repeats)
+    vec_sublinear = tfidf_sublinear.transform(doc_with_repeats)
+
+    # With sublinear TF, the ratio of dog to cat should be smaller
+    # because 1 + log(4) < 4 (relative to 1 + log(1) = 1)
+    ratio_linear = vec_linear[:dog] / vec_linear[:cat]
+    ratio_sublinear = vec_sublinear[:dog] / vec_sublinear[:cat]
+
+    assert_operator ratio_sublinear, :<, ratio_linear
+  end
+
+  # N-gram tests
+
+  def test_bigrams
+    tfidf = Classifier::TFIDF.new(ngram_range: [1, 2])
+
+    tfidf.fit(['quick brown fox', 'lazy brown dog'])
+
+    # Should have bigrams in vocabulary
+    bigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.include?('_') }
+
+    refute_empty bigram_terms, 'Should have bigram terms'
+  end
+
+  def test_bigrams_only
+    tfidf = Classifier::TFIDF.new(ngram_range: [2, 2])
+
+    tfidf.fit(['quick brown fox', 'lazy brown dog'])
+
+    # Should only have bigrams (terms with underscore)
+    tfidf.vocabulary.each_key do |term|
+      assert_includes term.to_s, '_', "Term #{term} should be a bigram"
+    end
+  end
+
+  def test_trigrams
+    tfidf = Classifier::TFIDF.new(ngram_range: [1, 3])
+
+    tfidf.fit(['quick brown fox jumps', 'lazy brown dog runs'])
+
+    trigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.count('_') == 2 }
+
+    refute_empty trigram_terms, 'Should have trigram terms'
+  end
+
+  # feature_names tests
+
+  def test_feature_names
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    names = tfidf.feature_names
+
+    assert_instance_of Array, names
+    assert_equal tfidf.vocabulary.size, names.size
+    names.each { |n| assert_instance_of Symbol, n }
+  end
+
+  # Serialization tests
+
+  def test_as_json
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    data = tfidf.as_json
+
+    assert_equal 1, data[:version]
+    assert_equal 'tfidf', data[:type]
+    assert_equal 2, data[:min_df]
+    assert data[:sublinear_tf]
+    assert data[:fitted]
+    refute_empty data[:vocabulary]
+    refute_empty data[:idf]
+  end
+
+  def test_to_json
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    json = tfidf.to_json
+    data = JSON.parse(json)
+
+    assert_equal 'tfidf', data['type']
+    assert data['fitted']
+  end
+
+  def test_from_json_string
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    json = tfidf.to_json
+    loaded = Classifier::TFIDF.from_json(json)
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary.size, loaded.vocabulary.size
+    assert_equal tfidf.num_documents, loaded.num_documents
+
+    # Transform should produce same results
+    original_vec = tfidf.transform('Dogs are great')
+    loaded_vec = loaded.transform('Dogs are great')
+
+    assert_equal original_vec, loaded_vec
+  end
+
+  def test_from_json_hash
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    hash = JSON.parse(tfidf.to_json)
+    loaded = Classifier::TFIDF.from_json(hash)
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary.size, loaded.vocabulary.size
+  end
+
+  def test_from_json_invalid_type_raises
+    invalid_json = { version: 1, type: 'invalid' }.to_json
+
+    assert_raises(ArgumentError) { Classifier::TFIDF.from_json(invalid_json) }
+  end
+
+  # Marshal tests
+
+  def test_marshal_dump_load
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    dumped = Marshal.dump(tfidf)
+    loaded = Marshal.load(dumped)
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary, loaded.vocabulary
+    assert_equal tfidf.idf, loaded.idf
+
+    # Transform should produce same results
+    original_vec = tfidf.transform('Dogs are great')
+    loaded_vec = loaded.transform('Dogs are great')
+
+    assert_equal original_vec, loaded_vec
+  end
+
+  # Edge cases
+
+  def test_single_document_corpus
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['Single document with words'])
+
+    assert_predicate tfidf, :fitted?
+    refute_empty tfidf.vocabulary
+  end
+
+  def test_document_with_only_stopwords
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['the and or but', 'dog cat bird'])
+
+    # Transform a document with only stopwords
+    vector = tfidf.transform('the and or but')
+
+    assert_empty vector
+  end
+
+  def test_repeated_fit_overwrites
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['apple banana'])
+    first_vocab = tfidf.vocabulary.dup
+
+    tfidf.fit(['cherry date elderberry'])
+
+    refute_equal first_vocab, tfidf.vocabulary
+  end
+
+  def test_unicode_text
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['Café manger boire', 'château jardin maison'])
+    vector = tfidf.transform('Café jardin')
+
+    refute_empty vector
+  end
+
+  def test_very_long_document
+    long_doc = (['word'] * 1000).join(' ')
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit([long_doc, 'short document'])
+    vector = tfidf.transform(long_doc)
+
+    refute_empty vector
+    # Should still be normalized
+    magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+
+    assert_in_delta 1.0, magnitude, 0.0001
+  end
+
+  def test_empty_document_in_corpus
+    # Empty strings should not cause issues
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['dog cat', '', 'bird fish'])
+
+    assert_predicate tfidf, :fitted?
+    assert_equal 3, tfidf.num_documents
+  end
+end
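The `min_df`/`max_df` tests above rely on how `within_df_bounds?` resolves the bounds: Floats are proportions of the corpus, with the lower bound rounded up and the upper bound rounded down, while Integers are absolute counts. A standalone sketch of that arithmetic, separate from the library code:

```ruby
# Mirrors within_df_bounds? in lib/classifier/tfidf.rb: a term is kept when
# lower <= document_frequency <= upper.
def df_bounds(min_df, max_df, num_docs)
  lower = min_df.is_a?(Float) ? (min_df * num_docs).ceil : min_df
  upper = max_df.is_a?(Float) ? (max_df * num_docs).floor : max_df
  [lower, upper]
end

df_bounds(1, 1.0, 4)    # => [1, 4]  the defaults keep every term
df_bounds(0.5, 1.0, 4)  # => [2, 4]  min_df: 0.5 needs at least 2 of 4 docs
df_bounds(2, 0.95, 4)   # => [2, 3]  max_df: 0.95 allows at most 3 of 4 docs
```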