CNDB-13075: Reuse the analyzed tokens of the right operand of an analyzed expression (#1620)

adelapena · driftx · commit fc5bb815dad6 · 2025-06-13T08:32:26.000-05:00
Memoize the analyzed tokens of the right operand of an analyzed expression.
Also add some refactoring and simplifications around how analysis is dealt with
in `RowFilter` and `Operator`.
diff --git a/src/java/org/apache/cassandra/cql3/Operator.java b/src/java/org/apache/cassandra/cql3/Operator.java
diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
@@ -257,7 +257,7 @@ else if (otherValue == null)
                 // the condition value is not null, so only NEQ can return true
                 return operator == Operator.NEQ;
             }
-            return operator.isSatisfiedBy(type, otherValue, value, null, null); // We don't use any analyzers in LWT, see CNDB-11658
+            return operator.isSatisfiedBy(type, otherValue, value); // We don't use any analyzers in LWT, see CNDB-11658
         }
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnConditions.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnConditions.java
@@ -93,7 +93,7 @@ public Set<ColumnMetadata> getAnalyzedColumns(IndexRegistry indexRegistry)
 
         for (ColumnCondition condition : this)
         {
-            if (indexRegistry.getIndexAnalyzerFor(condition.column, condition.operator).isPresent())
+            if (indexRegistry.getAnalyzerFor(condition.column, condition.operator, null).isPresent())
             {
                 analyzedColumns.add(condition.column);
             }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
@@ -1225,7 +1225,7 @@ public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry
         {
             var index = findSupportingIndex(indexRegistry);
             var valueBytes = value.bindAndGet(options);
-            var terms = index.getQueryAnalyzer().get().analyze(valueBytes);
+            var terms = index.getAnalyzer(valueBytes).get().queriedTokens();
             if (terms.isEmpty())
                 throw invalidRequest("BM25 query must contain at least one term (perhaps your analyzer is discarding tokens you didn't expect)");
             filter.add(columnDef, Operator.BM25, valueBytes);
diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
@@ -486,23 +486,12 @@ default RowFilter.CustomExpression customExpressionFor(TableMetadata metadata, B
     }
 
     /**
-     * Returns the write-time {@link Analyzer} for this index, if any. If the index doesn't transform the column values,
+     * Returns the {@link Analyzer} for this index, if any. If the index doesn't transform the column values,
      * this method will return an empty optional.
      *
-     * @return the write-time transforming column value analyzer for the index, if any
+     * @return the transforming column value analyzer for the index, if any
      */
-    default Optional<Analyzer> getIndexAnalyzer()
-    {
-        return Optional.empty();
-    }
-
-    /**
-     * Returns the query-time {@link Analyzer} for this index, if any. If the index doesn't transform the column values,
-     * this method will return an empty optional.
-     *
-     * @return the query-time transforming column value analyzer for the index, if any
-     */
-    default Optional<Analyzer> getQueryAnalyzer()
+    default Optional<Analyzer> getAnalyzer(ByteBuffer queriedValue)
     {
         return Optional.empty();
     }
@@ -514,10 +503,11 @@ default Optional<Analyzer> getQueryAnalyzer()
      * index. It can be used to perform the same transformation on values that the index does when indexing. That way,
      * the CQL operator can replicate the index behaviour when filtering results.
      */
-    @FunctionalInterface
     interface Analyzer
     {
-        List<ByteBuffer> analyze(ByteBuffer value);
+        List<ByteBuffer> indexedTokens(ByteBuffer indexedValue);
+
+        List<ByteBuffer> queriedTokens();
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/IndexRegistry.java b/src/java/org/apache/cassandra/index/IndexRegistry.java
@@ -20,6 +20,7 @@
  */
 package org.apache.cassandra.index;
 
+import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Optional;
@@ -315,25 +316,13 @@ default void registerIndex(Index index)
     Index getIndex(IndexMetadata indexMetadata);
     Collection<Index> listIndexes();
 
-    default Optional<Index.Analyzer> getIndexAnalyzerFor(ColumnMetadata column, Operator operator)
-    {
-        return getAnalyzerFor(column, operator, Index::getIndexAnalyzer);
-    }
-
-    default Optional<Index.Analyzer> getQueryAnalyzerFor(ColumnMetadata column, Operator operator)
-    {
-        return getAnalyzerFor(column, operator, Index::getQueryAnalyzer);
-    }
-
-    default Optional<Index.Analyzer> getAnalyzerFor(ColumnMetadata column,
-                                                    Operator operator,
-                                                    Function<Index, Optional<Index.Analyzer>> analyzerGetter)
+    default Optional<Index.Analyzer> getAnalyzerFor(ColumnMetadata column, Operator operator, ByteBuffer value)
     {
         for (Index index : listIndexes())
         {
             if (index.supportsExpression(column, operator))
             {
-                Optional<Index.Analyzer> analyzer = analyzerGetter.apply(index);
+                Optional<Index.Analyzer> analyzer = index.getAnalyzer(value);
                 if (analyzer.isPresent())
                     return analyzer;
             }
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
@@ -681,23 +681,34 @@ public AbstractType<?> customExpressionValueType()
     }
 
     @Override
-    public Optional<Analyzer> getIndexAnalyzer()
+    public Optional<Analyzer> getAnalyzer(ByteBuffer queriedValue)
     {
-        return indexContext.isAnalyzed()
-               ? Optional.of(value -> analyze(indexContext.getAnalyzerFactory(), value))
-               : Optional.empty();
-    }
+        if (!indexContext.isAnalyzed())
+            return Optional.empty();
 
-    @Override
-    public Optional<Analyzer> getQueryAnalyzer()
-    {
-        return indexContext.isAnalyzed()
-               ? Optional.of(value -> analyze(indexContext.getQueryAnalyzerFactory(), value))
-               : Optional.empty();
+        // memoize the analyzed queried value, so we don't have to re-analyze it for every evaluated column value
+        List<ByteBuffer> queriedTokens = analyze(indexContext.getQueryAnalyzerFactory(), queriedValue);
+
+        return Optional.of(new Analyzer() {
+            @Override
+            public List<ByteBuffer> indexedTokens(ByteBuffer value)
+            {
+                return analyze(indexContext.getAnalyzerFactory(), value);
+            }
+
+            @Override
+            public List<ByteBuffer> queriedTokens()
+            {
+                return queriedTokens;
+            }
+        });
     }
 
     private static List<ByteBuffer> analyze(AbstractAnalyzer.AnalyzerFactory factory, ByteBuffer value)
     {
+        if (value == null)
+            return null;
+
         List<ByteBuffer> tokens = new ArrayList<>();
         AbstractAnalyzer analyzer = factory.create();
         try
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Operation.java b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
@@ -310,8 +310,7 @@ static Node buildExpression(QueryController controller, RowFilter.Expression exp
                                                                                ListSerializer.readValue(expression.getIndexValue(),
                                                                                                         ByteBufferAccessor.instance,
                                                                                                         offset),
-                                                                               expression.indexAnalyzer(),
-                                                                               expression.queryAnalyzer(),
+                                                                               expression.analyzer(),
                                                                                expression.annOptions())));
                     offset += TypeSizes.INT_SIZE + ByteBufferAccessor.instance.getInt(expression.getIndexValue(), offset);
                 }
diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java
@@ -69,7 +69,13 @@ public boolean isIndexDataValid(Row row, long nowInSecs)
         var cell = row.getCell(context.getDefinition());
         if (!cell.isLive(nowInSecs))
             return false;
-        assert cell instanceof CellWithSourceTable : "Expected CellWithSource, got " + cell.getClass();
+        // Check if the cell is wrapped and, if not, skip the source table check
+        if (!(cell instanceof CellWithSourceTable))
+        {
+            // If the cell is not wrapped, we can't validate the source table,
+            // so we just check if the index data matches the live data
+            return isIndexDataEqualToLiveData(cell.buffer());
+        }
         return sourceTable.equals(((CellWithSourceTable<?>) cell).sourceTable())
                && isIndexDataEqualToLiveData(cell.buffer());
     }
diff --git a/test/unit/org/apache/cassandra/cql3/OperatorTest.java b/test/unit/org/apache/cassandra/cql3/OperatorTest.java
@@ -19,6 +19,8 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.List;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 
 import org.junit.Test;
@@ -36,7 +38,7 @@ public void testAnalyzer()
     {
         // test with a text-based case-insensitive analyzer
         UTF8Type utf8Type = UTF8Type.instance;
-        Index.Analyzer analyzer = value -> Collections.singletonList(utf8Type.decompose(utf8Type.compose(value).toUpperCase()));
+        Function<ByteBuffer, List<ByteBuffer>> analyzer = value -> Collections.singletonList(utf8Type.decompose(utf8Type.compose(value).toUpperCase()));
         testAnalyzer(utf8Type, utf8Type.decompose("FOO"), utf8Type.decompose("FOO"), analyzer, true);
         testAnalyzer(utf8Type, utf8Type.decompose("FOO"), utf8Type.decompose("foo"), analyzer, true);
         testAnalyzer(utf8Type, utf8Type.decompose("foo"), utf8Type.decompose("foo"), analyzer, true);
@@ -45,6 +47,7 @@ public void testAnalyzer()
 
         // test with an int-based analyzer that decomposes an integer into its digits
         Int32Type intType = Int32Type.instance;
+
         analyzer = value -> intType.compose(value)
                                    .toString()
                                    .chars()
@@ -63,48 +66,63 @@ public void testAnalyzer()
         testAnalyzer(utf8Type, intType.decompose(123), intType.decompose(1234), analyzer, false);
     }
 
+    private static Index.Analyzer analyzer(Function<ByteBuffer, List<ByteBuffer>> analyzer, ByteBuffer queriedValue)
+    {
+        return new Index.Analyzer()
+        {
+            @Override
+            public List<ByteBuffer> indexedTokens(ByteBuffer value)
+            {
+                return analyzer.apply(value);
+            }
+
+            @Override
+            public List<ByteBuffer> queriedTokens()
+            {
+                return analyzer.apply(queriedValue);
+            }
+        };
+    }
+
     private static void testAnalyzer(AbstractType<?> type,
-                                     ByteBuffer left,
-                                     ByteBuffer right,
-                                     Index.Analyzer analyzer,
+                                     ByteBuffer leftOperand,
+                                     ByteBuffer rightOperand,
+                                     Function<ByteBuffer, List<ByteBuffer>> analyzingFunction,
                                      boolean shouldBeSatisfied)
     {
+        // analyze the operands
+        Index.Analyzer analyzer = analyzer(analyzingFunction, rightOperand);
+        List<ByteBuffer> indexedTokens = analyzer.indexedTokens(leftOperand);
+        List<ByteBuffer> queriedTokens = analyzer.queriedTokens();
+
         // test that EQ and ANALYZER_MATCHES are satisfied by the same value with an analyzer
         for (Operator operator : Arrays.asList(Operator.EQ, Operator.ANALYZER_MATCHES))
-            Assertions.assertThat(operator.isSatisfiedBy(type, left, right, analyzer, analyzer)).isEqualTo(shouldBeSatisfied);
+            Assertions.assertThat(operator.isSatisfiedByAnalyzed(type, indexedTokens, queriedTokens)).isEqualTo(shouldBeSatisfied);
 
         // test that EQ without an analyzer behaves as type-based identity
-        Assertions.assertThat(Operator.EQ.isSatisfiedBy(type, left, right, null, null))
-                  .isEqualTo(type.compareForCQL(left, right) == 0);
+        Assertions.assertThat(Operator.EQ.isSatisfiedBy(type, leftOperand, rightOperand))
+                  .isEqualTo(type.compareForCQL(leftOperand, rightOperand) == 0);
 
         // test that ANALYZER_MATCHES throws an exception when no analyzer is provided
-        Assertions.assertThatThrownBy(() -> Operator.ANALYZER_MATCHES.isSatisfiedBy(type, left, right, null, null))
-                  .isInstanceOf(AssertionError.class)
+        Assertions.assertThatThrownBy(() -> Operator.ANALYZER_MATCHES.isSatisfiedBy(type, leftOperand, rightOperand))
+                  .isInstanceOf(UnsupportedOperationException.class)
                   .hasMessageContaining(": operation can only be computed by an indexed column with a configured analyzer");
 
-        // test that all other operators ignore the analyzer
+        // test that all other operators don't support the analyzer
         for (Operator operator : Operator.values())
         {
             if (operator == Operator.EQ || operator == Operator.ANALYZER_MATCHES)
                 continue;
 
-            boolean supported = false;
             try
             {
-                shouldBeSatisfied = operator.isSatisfiedBy(type, left, right, null, null);
-                supported = true;
+                operator.isSatisfiedBy(type, leftOperand, rightOperand);
             }
             catch (Exception e)
             {
-                Assertions.assertThatThrownBy(() -> operator.isSatisfiedBy(type, left, right, analyzer, analyzer))
-                          .isInstanceOf(e.getClass())
-                          .hasMessage(e.getMessage());
-            }
-
-            if (supported)
-            {
-                Assertions.assertThat(operator.isSatisfiedBy(type, left, right, analyzer, analyzer))
-                          .isEqualTo(shouldBeSatisfied);
+                Assertions.assertThatThrownBy(() -> operator.isSatisfiedByAnalyzed(type, indexedTokens, queriedTokens))
+                          .isInstanceOf(UnsupportedOperationException.class)
+                          .hasMessageContaining(operator + " operation does not support analyzers");
             }
         }
     }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java
@@ -1053,6 +1053,94 @@ public void testAnalyzerOnMapKeysWithDistinctQueryAnalyzer() throws Throwable
         });
     }
 
+    @Test
+    public void testAnalyzerOnMapValues() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, genres map<int, text>)");
+        execute("INSERT INTO %s (id, genres) VALUES ('1', {1: 'Horror', 2: 'comedy'})");
+
+        assertRowsNet(executeNet("SELECT id FROM %s WHERE genres CONTAINS 'Horror' ALLOW FILTERING"), row("1"));
+        assertRowsNet(executeNet("SELECT id FROM %s WHERE genres NOT CONTAINS 'Horror' ALLOW FILTERING"));
+        assertRowsNet(executeNet("SELECT id FROM %s WHERE genres CONTAINS 'horror' ALLOW FILTERING"));
+        assertRowsNet(executeNet("SELECT id FROM %s WHERE genres NOT CONTAINS 'horror' ALLOW FILTERING"), row("1"));
+
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(genres)) USING 'StorageAttachedIndex' WITH OPTIONS = { 'index_analyzer':'STANDARD'}");
+
+        beforeAndAfterFlush(() -> {
+            assertRowsNet(executeNet("SELECT id FROM %s WHERE genres CONTAINS 'horror'"), row("1"));
+            assertRowsNet(executeNet("SELECT id FROM %s WHERE genres NOT CONTAINS 'horror'"));
+
+            // map comparison with analyzer matches operator is not supported with or without filtering
+            Assertions.assertThatThrownBy(() -> execute("SELECT id FROM %s WHERE genres[1] : 'horror'"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("can't be used with collections");
+            Assertions.assertThatThrownBy(() -> execute("SELECT id FROM %s WHERE genres[1] : 'horror' ALLOW FILTERING"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("can't be used with collections");
+
+            // map comparison with eq operator is not supported by the index, and it's not analyzing when filtering
+            Assertions.assertThatThrownBy(() -> execute("SELECT id FROM %s WHERE genres[1] = 'horror'"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("Column 'genres' has an index but does not support the operators specified in the query.");
+            assertRows(execute("SELECT id FROM %s WHERE genres[1] = 'horror' ALLOW FILTERING"));
+            assertRows(execute("SELECT id FROM %s WHERE genres[1] = 'Horror' ALLOW FILTERING"), row("1"));
+        });
+    }
+
+    @Test
+    public void testAnalyzerOnMapValuesWithDistinctQueryAnalyzer() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, v map<int, text>, PRIMARY KEY(k, c))");
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(v)) USING 'StorageAttachedIndex' WITH OPTIONS = {" +
+                "'index_analyzer': '{" +
+                "  \"tokenizer\" : { \"name\" : \"whitespace\", \"args\" : {} }," +
+                "  \"filters\" : [ { \"name\" : \"lowercase\", \"args\": {} }, " +
+                "                  { \"name\" : \"edgengram\", \"args\": { \"minGramSize\":\"1\", \"maxGramSize\":\"30\" } }]," +
+                "  \"charFilters\" : []}', " +
+                "'query_analyzer': '{" +
+                "  \"tokenizer\" : { \"name\" : \"whitespace\", \"args\" : {} }," +
+                "  \"filters\" : [ {\"name\" : \"lowercase\",\"args\": {}} ]}'}");
+
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, {0: 'astra quick fox', 1: 'astra quick foxes', 2: 'astra4', 3: 'astra5 -1@a#', 4: 'lazy dog'})");
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 2, {0: 'astra quick fox'})");
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 3, {0: 'astra quick foxes'})");
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 4, {0: 'astra4'})");
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 5, {0: 'astra5 -1@a#'})");
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 6, {0: 'lazy dog'})");
+
+        beforeAndAfterFlush(() -> {
+
+            // contains
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v CONTAINS 'ast'"), row(1), row(2), row(3), row(4), row(5));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v CONTAINS 'astra'"), row(1), row(2), row(3), row(4), row(5));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v CONTAINS 'astra4'"), row(1), row(4));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v CONTAINS 'astra5'"), row(1), row(5));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v CONTAINS 'astra9'"));
+
+            // not contains
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v NOT CONTAINS 'ast'"), row(6));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v NOT CONTAINS 'astra'"), row(6));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v NOT CONTAINS 'astra4'"), row(2), row(3), row(5), row(6));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v NOT CONTAINS 'astra5'"), row(2), row(3), row(4), row(6));
+            assertRowsNet(executeNet("SELECT c FROM %s WHERE v NOT CONTAINS 'astra9'"), row(1), row(2), row(3), row(4), row(5), row(6));
+
+            // map comparison with analyzer matches operator is not supported with or without filtering
+            Assertions.assertThatThrownBy(() -> execute("SELECT c FROM %s WHERE v[0] : 'ast'"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("can't be used with collections");
+            Assertions.assertThatThrownBy(() -> execute("SELECT c FROM %s WHERE v[0] : 'ast' ALLOW FILTERING"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("can't be used with collections");
+
+            // map comparison with eq operator is not supported by the index, and it's not analyzing when filtering
+            Assertions.assertThatThrownBy(() -> execute("SELECT c FROM %s WHERE v[0] = 'ast'"))
+                    .isInstanceOf(InvalidRequestException.class)
+                    .hasMessageContaining("Column 'v' has an index but does not support the operators specified in the query.");
+            assertRows(execute("SELECT c FROM %s WHERE v[0] = 'ast' ALLOW FILTERING"));
+            assertRows(execute("SELECT c FROM %s WHERE v[0] = 'astra quick fox' ALLOW FILTERING"), row(1), row(2));
+        });
+    }
+
     private void assertClientWarningOnNGram(String indexOptions)
     {
         createIndexFromOptions(indexOptions);

Original file line number	Diff line number	Diff line change
`@@ -257,7 +257,7 @@ else if (otherValue == null)`
`257`	`257`	`// the condition value is not null, so only NEQ can return true`
`258`	`258`	`return operator == Operator.NEQ;`
`259`	`259`	`}`
`260`		`- return operator.isSatisfiedBy(type, otherValue, value, null, null); // We don't use any analyzers in LWT, see CNDB-11658`
	`260`	`+ return operator.isSatisfiedBy(type, otherValue, value); // We don't use any analyzers in LWT, see CNDB-11658`
`261`	`261`	`}`
`262`	`262`	`}`
`263`	`263`
Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,7 @@ public Set<ColumnMetadata> getAnalyzedColumns(IndexRegistry indexRegistry)`
`93`	`93`
`94`	`94`	`for (ColumnCondition condition : this)`
`95`	`95`	`{`
`96`		`- if (indexRegistry.getIndexAnalyzerFor(condition.column, condition.operator).isPresent())`
	`96`	`+ if (indexRegistry.getAnalyzerFor(condition.column, condition.operator, null).isPresent())`
`97`	`97`	`{`
`98`	`98`	`analyzedColumns.add(condition.column);`
`99`	`99`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1225,7 +1225,7 @@ public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry`
`1225`	`1225`	`{`
`1226`	`1226`	`var index = findSupportingIndex(indexRegistry);`
`1227`	`1227`	`var valueBytes = value.bindAndGet(options);`
`1228`		`- var terms = index.getQueryAnalyzer().get().analyze(valueBytes);`
	`1228`	`+ var terms = index.getAnalyzer(valueBytes).get().queriedTokens();`
`1229`	`1229`	`if (terms.isEmpty())`
`1230`	`1230`	`throw invalidRequest("BM25 query must contain at least one term (perhaps your analyzer is discarding tokens you didn't expect)");`
`1231`	`1231`	`filter.add(columnDef, Operator.BM25, valueBytes);`