Skip to content

Commit 39bbe03

Browse files
committed
Split/Splice: server-advertised chunking algorithms
Introduce ChunkingFunction, an enum of known chunking algorithms that the server can recommend to the client. Provide FastCDC_2020 as the first explicit chunking algorithm. The server advertises these through a new chunking_configuration field in the CacheCapabilities message. There, the server may set the chunking functions that it supports as well as the relevant configuration parameters for each chunking algorithm.
1 parent 9ef19c6 commit 39bbe03

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

build/bazel/remote/execution/v2/remote_execution.proto

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1976,6 +1976,12 @@ message SplitBlobRequest {
19761976
// length of the blob digest hashes and the digest functions announced
19771977
// in the server's capabilities.
19781978
DigestFunction.Value digest_function = 3;
1979+
1980+
// The chunking function that the client prefers to use.
1981+
//
1982+
// The server MAY use a different chunking function. The client MUST check
1983+
// the chunking function used in the response.
1984+
ChunkingFunction.Value chunking_function = 4;
19791985
}
19801986

19811987
// A response message for
@@ -1988,6 +1994,9 @@ message SplitBlobResponse {
19881994
// The server MUST use the same digest function as the one explicitly or
19891995
// implicitly (through hash length) specified in the split request.
19901996
repeated Digest chunk_digests = 1;
1997+
1998+
// The chunking function used to split the blob.
1999+
ChunkingFunction.Value chunking_function = 2;
19912000
}
19922001

19932002
// A request message for
@@ -2023,6 +2032,9 @@ message SpliceBlobRequest {
20232032
// server SHOULD infer the digest function using the length of the blob digest
20242033
// hashes and the digest functions announced in the server's capabilities.
20252034
DigestFunction.Value digest_function = 4;
2035+
2036+
// The chunking function that the client used to split the blob.
2037+
ChunkingFunction.Value chunking_function = 5;
20262038
}
20272039

20282040
// A response message for
@@ -2158,6 +2170,49 @@ message DigestFunction {
21582170
}
21592171
}
21602172

2173+
// The chunking function is used to split a blob into chunks.
2174+
//
2175+
// The server SHOULD advertise the chunking functions it supports via the
2176+
// [ChunkingConfiguration.supported_chunking_algorithms][build.bazel.remote.execution.v2.ChunkingConfiguration.supported_chunking_algorithms]
2177+
// field.
2178+
//
2179+
// The client SHOULD use one of the chunking functions advertised by the server.
2180+
//
2181+
// When blob splitting and splicing is used at the same time, the clients and
2182+
// the server SHOULD agree out-of-band upon a chunking algorithm used by both
2183+
// parties to benefit from each other's chunk data and avoid unnecessary data
2184+
// duplication.
2185+
message ChunkingFunction {
2186+
enum Value {
2187+
// A generic chunking function. If a server supports blob splitting/splicing
2188+
// and advertises this value, it can be safely assumed that the original
2189+
// blob can be recreated by concatenating the chunks. No other assumptions
2190+
// about the chunking algorithm can be made.
2191+
UNKNOWN = 0;
2192+
2193+
// This is a variant of the FastCDC chunking algorithm as described in the
2194+
// 2020 paper by Wen Xia, et al.
2195+
// See https://ieeexplore.ieee.org/document/9055082 for details.
2196+
// A reference implementation can be found in the Rust library
2197+
// https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2198+
// with the gear tables available at
2199+
// https://github.com/nlfiedler/fastcdc-rs/blob/3.2.1/src/v2020/mod.rs
2200+
//
2201+
// A server that supports this chunking function MUST advertise the following
2202+
// configuration parameters through the CacheCapabilities message:
2203+
// - normalization_level
2204+
// - min_chunk_size_bytes
2205+
// - avg_chunk_size_bytes
2206+
// - max_chunk_size_bytes
2207+
// - seed
2208+
//
2209+
// The client MUST use these advertised parameters to set up the FastCDC chunker.
2210+
// The remaining parameters, such as mask_s and mask_l, can be derived from the
2211+
// average chunk size parameter.
2212+
FASTCDC_2020 = 1;
2213+
}
2214+
}
2215+
21612216
// Describes the server/instance capabilities for updating the action cache.
21622217
message ActionCacheUpdateCapabilities {
21632218
bool update_enabled = 1;
@@ -2279,6 +2334,50 @@ message CacheCapabilities {
22792334
// [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob]
22802335
// operation.
22812336
bool blob_splice_support = 10;
2337+
2338+
// The chunking configuration of the server.
2339+
ChunkingConfiguration chunking_configuration = 11;
2340+
}
2341+
2342+
// The chunking configuration of the server.
2343+
message ChunkingConfiguration {
2344+
// If any of the advertised parameters are not within the expected range,
2345+
// the client SHOULD ignore FastCDC chunking function support.
2346+
message FastCDCParams {
2347+
// The normalization level for the FastCDC chunking algorithm.
2348+
// The value MUST be between 0 and 3.
2349+
uint32 normalization_level = 1;
2350+
2351+
// The minimum chunk size for the FastCDC chunking algorithm.
2352+
// The value MUST be between 256 bytes and 64 KiB.
2353+
uint64 min_chunk_size_bytes = 2;
2354+
2355+
// The average chunk size for the FastCDC chunking algorithm.
2356+
// The value MUST be between 1 KiB and 256 KiB.
2357+
uint64 avg_chunk_size_bytes = 3;
2358+
2359+
// The maximum chunk size for the FastCDC chunking algorithm.
2360+
// The value MUST be between 4 KiB and 4 MiB.
2361+
uint64 max_chunk_size_bytes = 4;
2362+
2363+
// The seed for the FastCDC chunking algorithm.
2364+
uint32 seed = 5;
2365+
}
2366+
2367+
// A list of chunking algorithms that the server supports for splitting and
2368+
// splicing blobs.
2369+
repeated ChunkingFunction.Value supported_chunking_algorithms = 1;
2370+
2371+
// The minimum blob size that should be considered for chunking.
2372+
// Blobs smaller than this threshold SHOULD be sent as single blobs.
2373+
// If unset, clients SHOULD use max_cas_blob_size_bytes as the
2374+
// minimum blob size for chunking.
2375+
// If both this field and max_cas_blob_size_bytes are unset, clients
2376+
// MAY chunk blobs of any size.
2377+
uint64 min_blob_size_for_chunking_bytes = 2;
2378+
2379+
// The parameters for the FastCDC chunking algorithm.
2380+
FastCDCParams fastcdc_params = 3;
22822381
}
22832382

22842383
// Capabilities of the remote execution system.

0 commit comments

Comments
 (0)