Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
95ed224
first draft
alessandrobenedetti Mar 31, 2025
69dcae3
Only Nested Vectors changes
alessandrobenedetti Apr 2, 2025
19ac894
first tests draft, parent filter and children filter missing as a test
alessandrobenedetti Apr 4, 2025
20792a5
tests cleaned
alessandrobenedetti Apr 8, 2025
f209d38
code cleanup
alessandrobenedetti Apr 8, 2025
79c863b
draft documentation
alessandrobenedetti Apr 8, 2025
cba5473
tidy
alessandrobenedetti Apr 8, 2025
5f09496
tidy
alessandrobenedetti Apr 9, 2025
d40def2
first draft
alessandrobenedetti Mar 31, 2025
bb8cfde
Only Multi valued Vectors changes
alessandrobenedetti Apr 2, 2025
882eb9c
first draft
alessandrobenedetti Apr 9, 2025
993f085
first draft
alessandrobenedetti Apr 9, 2025
9713cc0
first draft
alessandrobenedetti Apr 9, 2025
a203a2d
first draft with working tests
alessandrobenedetti Apr 9, 2025
59e59ad
draft for automatic child transformer
alessandrobenedetti Apr 9, 2025
723b8b1
draft for automatic child transformer
alessandrobenedetti Apr 10, 2025
3693141
add best child per document transformer
alessandrobenedetti Apr 10, 2025
cdff4ad
add best child per document transformer
alessandrobenedetti Apr 10, 2025
88a0536
add best child per document transformer
alessandrobenedetti Apr 10, 2025
fcbf770
minor refinement to avoid some instructions
alessandrobenedetti Apr 16, 2025
cae962e
minor refactor
alessandrobenedetti Apr 16, 2025
4a7fdac
Update solr/core/src/java/org/apache/solr/search/join/BlockJoinParent…
alessandrobenedetti Apr 16, 2025
6a6aed8
Update solr/core/src/java/org/apache/solr/search/join/BlockJoinParent…
alessandrobenedetti Apr 16, 2025
b90ab87
Merge branch 'upstreamMain' into feature/SOLR-17736
alessandrobenedetti Dec 5, 2025
db232ec
new approach following feedback
alessandrobenedetti Dec 6, 2025
8c78b23
tests fixed for the new approach
alessandrobenedetti Dec 9, 2025
6e13aca
tidy + documentation
alessandrobenedetti Dec 9, 2025
a2aff99
tidy + documentation
alessandrobenedetti Dec 9, 2025
bac533e
tidy + documentation
alessandrobenedetti Dec 9, 2025
3b6f0c2
Merge branch 'upstreamMain' into feature/SOLR-17736
alessandrobenedetti Dec 10, 2025
de531d2
tidy + documentation
alessandrobenedetti Dec 10, 2025
1994007
Merge branch 'feature/SOLR-17736' into vectorMultivalued
alessandrobenedetti Dec 11, 2025
638c039
Merge branch 'upstreamMain' into vectorMultivalued
alessandrobenedetti Dec 11, 2025
227870b
catching up and merging
alessandrobenedetti Dec 11, 2025
b9865c1
work in progress
alessandrobenedetti Dec 17, 2025
6475c06
work in progress
alessandrobenedetti Dec 17, 2025
bcf9996
work in progress
alessandrobenedetti Dec 18, 2025
c19c33f
float multi valued vectors managed
alessandrobenedetti Dec 18, 2025
80cf944
first fully working draft, green tests
alessandrobenedetti Dec 19, 2025
b62033d
Merge branch 'upstreamMain' into vectorMultivalued
alessandrobenedetti Jan 15, 2026
c02d8ca
first fully working draft, green tests
alessandrobenedetti Jan 15, 2026
dfe966a
tidy and changelog
alessandrobenedetti Jan 15, 2026
d187eb3
first doc draft
alessandrobenedetti Jan 15, 2026
8203342
tests fixed
alessandrobenedetti Jan 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog/unreleased/SOLR-18074.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
title: Introducing support for multi-valued dense vector representation in documents through nested vectors
type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other
authors:
- name: Alessandro Benedetti
links:
- name: SOLR-18074
url: https://issues.apache.org/jira/browse/SOLR-18074
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,13 @@
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -35,14 +39,17 @@
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.join.BitSetProducer;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DenseVectorField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.BitsFilteredPostingsEnum;
import org.apache.solr.search.DocIterationInfo;
import org.apache.solr.search.DocSet;
Expand Down Expand Up @@ -138,6 +145,20 @@ public void transform(SolrDocument rootDoc, int rootDocId, DocIterationInfo docI
final Bits liveDocs = leafReaderContext.reader().getLiveDocs();
final int segBaseId = leafReaderContext.docBase;
final int segRootId = rootDocId - segBaseId;
Set<String> multiValuedFLoatVectorFields =
this.getMultiValuedVectorFields(
searcher.getSchema(), childReturnFields, VectorEncoding.FLOAT32);
Set<String> multiValuedByteVectorFields =
this.getMultiValuedVectorFields(
searcher.getSchema(), childReturnFields, VectorEncoding.BYTE);
if ((multiValuedFLoatVectorFields.size() + multiValuedByteVectorFields.size()) > 0 &&
(multiValuedFLoatVectorFields.size() + multiValuedByteVectorFields.size())
!= childReturnFields.getExplicitlyRequestedFieldNames().size()) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"When using the Child transformer to flatten nested vectors, all 'fl' must be "
+ "multivalued vector fields");
}

// can be -1 and that's okay (happens for the very first block)
final int segPrevRootId;
Expand Down Expand Up @@ -219,8 +240,19 @@ public void transform(SolrDocument rootDoc, int rootDocId, DocIterationInfo docI

if (isAncestor) {
// if this path has pending child docs, add them.
addChildrenToParent(
doc, pendingParentPathsToChildren.remove(fullDocPath)); // no longer pending
if (!multiValuedFLoatVectorFields.isEmpty() || !multiValuedByteVectorFields.isEmpty()) {
addFlatMultiValuedFloatVectorsToParent(
rootDoc,
pendingParentPathsToChildren.values().iterator().next(),
multiValuedFLoatVectorFields);
addFlatMultiValuedByteVectorsToParent(
rootDoc,
pendingParentPathsToChildren.values().iterator().next(),
multiValuedByteVectorFields);
} else {
addChildrenToParent(
doc, pendingParentPathsToChildren.remove(fullDocPath)); // no longer pending
}
}

// get parent path
Expand Down Expand Up @@ -248,7 +280,18 @@ public void transform(SolrDocument rootDoc, int rootDocId, DocIterationInfo docI
assert pendingParentPathsToChildren.keySet().size() == 1;

// size == 1, so get the last remaining entry
addChildrenToParent(rootDoc, pendingParentPathsToChildren.values().iterator().next());
if (!multiValuedFLoatVectorFields.isEmpty() || !multiValuedByteVectorFields.isEmpty()) {
addFlatMultiValuedFloatVectorsToParent(
rootDoc,
pendingParentPathsToChildren.values().iterator().next(),
multiValuedFLoatVectorFields);
addFlatMultiValuedByteVectorsToParent(
rootDoc,
pendingParentPathsToChildren.values().iterator().next(),
multiValuedByteVectorFields);
} else {
addChildrenToParent(rootDoc, pendingParentPathsToChildren.values().iterator().next());
}

} catch (IOException e) {
// TODO DWS: reconsider this unusual error handling approach; shouldn't we rethrow?
Expand All @@ -257,6 +300,20 @@ public void transform(SolrDocument rootDoc, int rootDocId, DocIterationInfo docI
}
}

/**
 * Collects the explicitly requested child return fields that are declared in the schema as
 * multivalued dense vector fields with the given encoding.
 *
 * <p>Requested names that do not resolve to a schema field are ignored rather than dereferenced.
 *
 * @param schema the schema used to resolve each requested field name
 * @param childReturnFields the return fields whose explicitly requested names are inspected
 * @param encoding the vector encoding a field must use to be included
 * @return the matching field names; empty when none match
 */
private Set<String> getMultiValuedVectorFields(
    IndexSchema schema, SolrReturnFields childReturnFields, VectorEncoding encoding) {
  Set<String> multiValuedVectorsFields = new HashSet<>();
  Set<String> requestedFieldNames = childReturnFields.getExplicitlyRequestedFieldNames();
  if (requestedFieldNames == null) {
    // NOTE(review): defensive - presumably null means nothing was explicitly requested; confirm
    // against the ReturnFields contract.
    return multiValuedVectorsFields;
  }
  for (String fieldName : requestedFieldNames) {
    SchemaField sfield = schema.getFieldOrNull(fieldName);
    if (sfield == null) {
      // the requested name is not a schema field, so it cannot be a vector field
      continue;
    }
    if (sfield.getType() instanceof DenseVectorField
        && sfield.multiValued()
        && ((DenseVectorField) sfield.getType()).getVectorEncoding() == encoding) {
      multiValuedVectorsFields.add(fieldName);
    }
  }
  return multiValuedVectorsFields;
}

private static void addChildrenToParent(
SolrDocument parent, Map<String, List<SolrDocument>> children) {
for (Map.Entry<String, List<SolrDocument>> entry : children.entrySet()) {
Expand Down Expand Up @@ -285,6 +342,55 @@ private static void addChildrenToParent(
parent.setField(trimmedPath, children.get(0));
}

/**
 * For every given vector field, gathers the vectors held by the child documents registered under
 * that field name and sets them on the parent as a list of vectors (one inner list per child).
 *
 * <p>A field with no entry in {@code children} is skipped, leaving the parent untouched for that
 * field. A child document that yields no values for the field is skipped as well.
 *
 * @param parent the document that receives the flattened vectors
 * @param children child documents, looked up by vector field name
 * @param multiValuedVectorFields names of the float vector fields to flatten
 */
private void addFlatMultiValuedFloatVectorsToParent(
    SolrDocument parent,
    Map<String, List<SolrDocument>> children,
    Set<String> multiValuedVectorFields) {
  for (String multiValuedVectorField : multiValuedVectorFields) {
    List<SolrDocument> solrDocuments = children.get(multiValuedVectorField);
    if (solrDocuments == null) {
      // Map.get returns null when there are no children for this field: nothing to flatten
      continue;
    }
    List<List<Number>> multiValuedVectors = new ArrayList<>(solrDocuments.size());
    for (SolrDocument singleVector : solrDocuments) {
      Collection<Object> vectorValues = singleVector.getFieldValues(multiValuedVectorField);
      if (vectorValues == null) {
        // NOTE(review): presumably null means the child has no value for this field - confirm
        continue;
      }
      multiValuedVectors.add(this.extractFloatVector(vectorValues));
    }
    parent.setField(multiValuedVectorField, multiValuedVectors);
  }
}

/**
 * For every given vector field, gathers the byte vectors held by the child documents registered
 * under that field name and sets them on the parent as a list of vectors (one inner list per
 * child).
 *
 * <p>A field with no entry in {@code children} is skipped, leaving the parent untouched for that
 * field. A child document that yields no values for the field is skipped as well.
 *
 * @param parent the document that receives the flattened vectors
 * @param children child documents, looked up by vector field name
 * @param multiValuedVectorFields names of the byte vector fields to flatten
 */
private void addFlatMultiValuedByteVectorsToParent(
    SolrDocument parent,
    Map<String, List<SolrDocument>> children,
    Set<String> multiValuedVectorFields) {
  for (String multiValuedVectorField : multiValuedVectorFields) {
    List<SolrDocument> solrDocuments = children.get(multiValuedVectorField);
    if (solrDocuments == null) {
      // Map.get returns null when there are no children for this field: nothing to flatten
      continue;
    }
    List<List<Number>> multiValuedVectors = new ArrayList<>(solrDocuments.size());
    for (SolrDocument singleVector : solrDocuments) {
      Collection<Object> vectorValues = singleVector.getFieldValues(multiValuedVectorField);
      if (vectorValues == null || vectorValues.isEmpty()) {
        // extractByteVector reads the first value, so an absent or empty collection is skipped
        continue;
      }
      multiValuedVectors.add(this.extractByteVector(vectorValues));
    }
    parent.setField(multiValuedVectorField, multiValuedVectors);
  }
}

/**
 * Converts the stored values of a single vector into a list of numbers, preserving the
 * iteration order of the given collection.
 *
 * @param fieldValues the values of one vector; each element is cast to {@link StoredField}
 * @return the numeric value of every element, in iteration order
 */
private List<Number> extractFloatVector(Collection<Object> fieldValues) {
  final List<Number> components = new ArrayList<>(fieldValues.size());
  final Iterator<Object> values = fieldValues.iterator();
  while (values.hasNext()) {
    components.add(((StoredField) values.next()).numericValue());
  }
  return components;
}

/**
 * Converts a stored byte vector into a list of numbers.
 *
 * <p>Only the first element of the collection is read; it is cast to {@link StoredField} and its
 * binary value is unpacked byte by byte.
 *
 * @param singleVector the stored values of one vector; must contain at least one element
 * @return the bytes of the vector, in order
 */
private List<Number> extractByteVector(Collection<Object> singleVector) {
  StoredField vector = (StoredField) singleVector.iterator().next();
  BytesRef byteVector = vector.binaryValue();
  List<Number> extractedVector = new ArrayList<>(byteVector.length);
  // A BytesRef is a slice [offset, offset + length) of its backing array; reading the whole
  // array would pick up bytes that are not part of this value.
  final int end = byteVector.offset + byteVector.length;
  for (int i = byteVector.offset; i < end; i++) {
    extractedVector.add(byteVector.bytes[i]);
  }
  return extractedVector;
}

private static String getLastPath(String path) {
int lastIndexOfPathSepChar = path.lastIndexOf(PATH_SEP_CHAR);
if (lastIndexOfPathSepChar == -1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,6 @@ protected boolean enableDocValuesByDefault() {
@Override
public void checkSchemaField(final SchemaField field) throws SolrException {
super.checkSchemaField(field);
if (field.multiValued()) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
getClass().getSimpleName() + " fields can not be multiValued: " + field.getName());
}

if (field.hasDocValues()) {
throw new SolrException(
Expand Down
1 change: 1 addition & 0 deletions solr/core/src/java/org/apache/solr/schema/IndexSchema.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ public class IndexSchema {
public static final String NAME = "name";
public static final String NEST_PARENT_FIELD_NAME = "_nest_parent_";
public static final String NEST_PATH_FIELD_NAME = "_nest_path_";
public static final String NESTED_VECTORS_PSEUDO_FIELD_NAME = "_nested_vectors_";
public static final String REQUIRED = "required";
public static final String SCHEMA = "schema";
public static final String SIMILARITY = "similarity";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,20 @@

package org.apache.solr.update.processor;

import static org.apache.solr.schema.IndexSchema.NESTED_VECTORS_PSEUDO_FIELD_NAME;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.DenseVectorField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.AddUpdateCommand;

/**
Expand Down Expand Up @@ -63,13 +69,15 @@ private static class NestedUpdateProcessor extends UpdateRequestProcessor {
private boolean storePath;
private boolean storeParent;
private String uniqueKeyFieldName;
private IndexSchema schema;

NestedUpdateProcessor(
SolrQueryRequest req, boolean storeParent, boolean storePath, UpdateRequestProcessor next) {
super(next);
this.storeParent = storeParent;
this.storePath = storePath;
this.uniqueKeyFieldName = req.getSchema().getUniqueKeyField().getName();
this.schema = req.getSchema();
}

@Override
Expand All @@ -81,66 +89,111 @@ public void processAdd(AddUpdateCommand cmd) throws IOException {

private boolean processDocChildren(SolrInputDocument doc, String fullPath) {
boolean isNested = false;
List<String> originalVectorFieldsToRemove = new ArrayList<>();
ArrayList<SolrInputDocument> vectors = new ArrayList<>();
for (SolrInputField field : doc.values()) {
SchemaField sfield = schema.getFieldOrNull(field.getName());
int childNum = 0;
boolean isSingleVal = !(field.getValue() instanceof Collection);
for (Object val : field) {
if (!(val instanceof SolrInputDocument cDoc)) {
// either all collection items are child docs or none are.
break;
}
final String fieldName = field.getName();

if (fieldName.contains(PATH_SEP_CHAR)) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Field name: '"
+ fieldName
+ "' contains: '"
+ PATH_SEP_CHAR
+ "' , which is reserved for the nested URP");
}
final String sChildNum = isSingleVal ? SINGULAR_VALUE_CHAR : String.valueOf(childNum);
if (!cDoc.containsKey(uniqueKeyFieldName)) {
boolean firstLevelChildren = fullPath == null;
if (firstLevelChildren && sfield != null && isMultiValuedVectorField(sfield)) {
for (Object vectorValue : field.getValues()) {
SolrInputDocument singleVectorNestedDoc = new SolrInputDocument();
singleVectorNestedDoc.setField(field.getName(), vectorValue);
final String sChildNum = isSingleVal ? SINGULAR_VALUE_CHAR : String.valueOf(childNum);
String parentDocId = doc.getField(uniqueKeyFieldName).getFirstValue().toString();
cDoc.setField(
uniqueKeyFieldName, generateChildUniqueId(parentDocId, fieldName, sChildNum));
singleVectorNestedDoc.setField(
uniqueKeyFieldName, generateChildUniqueId(parentDocId, field.getName(), sChildNum));

if (!isNested) {
isNested = true;
}
final String lastKeyPath = PATH_SEP_CHAR + field.getName() + NUM_SEP_CHAR + sChildNum;
final String childDocPath = firstLevelChildren ? lastKeyPath : fullPath + lastKeyPath;
if (storePath) {
setPathField(singleVectorNestedDoc, childDocPath);
}
if (storeParent) {
setParentKey(singleVectorNestedDoc, doc);
}
++childNum;
vectors.add(singleVectorNestedDoc);
}
if (!isNested) {
isNested = true;
originalVectorFieldsToRemove.add(field.getName());
} else {
for (Object val : field) {
if (!(val instanceof SolrInputDocument cDoc)) {
// either all collection items are child docs or none are.
break;
}
final String fieldName = field.getName();

if (fieldName.contains(PATH_SEP_CHAR)) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Field name: '"
+ fieldName
+ "' contains: '"
+ PATH_SEP_CHAR
+ "' , which is reserved for the nested URP");
}
final String sChildNum = isSingleVal ? SINGULAR_VALUE_CHAR : String.valueOf(childNum);
if (!cDoc.containsKey(uniqueKeyFieldName)) {
String parentDocId = doc.getField(uniqueKeyFieldName).getFirstValue().toString();
cDoc.setField(
uniqueKeyFieldName, generateChildUniqueId(parentDocId, fieldName, sChildNum));
}
if (!isNested) {
isNested = true;
}
final String lastKeyPath = PATH_SEP_CHAR + fieldName + NUM_SEP_CHAR + sChildNum;
// concat of all paths children.grandChild => /children#1/grandChild#
final String childDocPath = firstLevelChildren ? lastKeyPath : fullPath + lastKeyPath;
processChildDoc(cDoc, doc, childDocPath);
++childNum;
}
final String lastKeyPath = PATH_SEP_CHAR + fieldName + NUM_SEP_CHAR + sChildNum;
// concat of all paths children.grandChild => /children#1/grandChild#
final String childDocPath = fullPath == null ? lastKeyPath : fullPath + lastKeyPath;
processChildDoc(cDoc, doc, childDocPath);
++childNum;
}
}
this.cleanOriginalVectorFields(doc, originalVectorFieldsToRemove);
if (vectors.size() > 0) {
doc.setField(NESTED_VECTORS_PSEUDO_FIELD_NAME, vectors);
}
return isNested;
}

/**
 * Removes each of the listed fields from the given document.
 *
 * @param doc the document to remove the fields from
 * @param originalVectorFieldsToRemove names of the fields to remove
 */
private void cleanOriginalVectorFields(
    SolrInputDocument doc, List<String> originalVectorFieldsToRemove) {
  originalVectorFieldsToRemove.forEach(doc::removeField);
}

/**
 * Tells whether the schema field is a dense vector field declared as multivalued.
 *
 * @param sfield the schema field to inspect
 * @return {@code true} when the field type is a {@link DenseVectorField} and the field is
 *     multivalued
 */
private static boolean isMultiValuedVectorField(SchemaField sfield) {
  if (!(sfield.getType() instanceof DenseVectorField)) {
    return false;
  }
  return sfield.multiValued();
}

private void processChildDoc(
SolrInputDocument sdoc, SolrInputDocument parent, String fullPath) {
SolrInputDocument child, SolrInputDocument parent, String fullPath) {
if (storePath) {
setPathField(sdoc, fullPath);
setPathField(child, fullPath);
}
if (storeParent) {
setParentKey(sdoc, parent);
setParentKey(child, parent);
}
processDocChildren(sdoc, fullPath);
processDocChildren(child, fullPath);
}

/**
 * Builds a child document id by joining the parent id, the child's key and the child number
 * with the path and number separators, e.g. "10/footnote#1".
 *
 * @param parentId id of the parent document
 * @param childKey name of the field holding the child
 * @param childNum position marker of the child within that field
 * @return the concatenated child id
 */
private String generateChildUniqueId(String parentId, String childKey, String childNum) {
  final StringBuilder childId = new StringBuilder();
  childId.append(parentId).append(PATH_SEP_CHAR).append(childKey);
  childId.append(NUM_SEP_CHAR).append(childNum);
  return childId.toString();
}

private void setParentKey(SolrInputDocument sdoc, SolrInputDocument parent) {
sdoc.setField(IndexSchema.NEST_PARENT_FIELD_NAME, parent.getFieldValue(uniqueKeyFieldName));
private void setParentKey(SolrInputDocument child, SolrInputDocument parent) {
child.setField(IndexSchema.NEST_PARENT_FIELD_NAME, parent.getFieldValue(uniqueKeyFieldName));
}

private void setPathField(SolrInputDocument sdoc, String fullPath) {
sdoc.setField(IndexSchema.NEST_PATH_FIELD_NAME, fullPath);
private void setPathField(SolrInputDocument child, String fullPath) {
child.setField(IndexSchema.NEST_PATH_FIELD_NAME, fullPath);
}
}
}
Loading
Loading