1818#include < boost/random/uniform_int_distribution.hpp>
1919
2020#include < re2/re2.h>
21+ #include < algorithm>
2122#include < unordered_set>
2223#include " velox/common/base/Fs.h"
2324#include " velox/common/encode/Base64.h"
@@ -108,12 +109,25 @@ class WriterFuzzer {
108109 // zero-based ordinal number of the column.
109110 // Data types is chosen from '<columnTypes>' and for nested complex data type,
110111 // maxDepth limits the max layers of nesting.
112+ // 'offset' is the number of columns that have already been generated.
113+ // The function generates the remaining columns starting from this index.
111114 std::vector<std::string> generateColumns (
112115 int32_t maxNumColumns,
113116 const std::string& prefix,
114117 const std::vector<TypePtr>& dataTypes,
115118 int32_t maxDepth,
116119 std::vector<std::string>& names,
120+ std::vector<TypePtr>& types,
121+ int32_t offset = 0 );
122+
123+ // Generates at least one and up to 'maxNumColumns' sort columns, a random
124+ // number of which overlap with (are reused from) the bucket-by columns.
125+ // Returns the sort column names and the number of overlapping bucket columns.
126+ // The overlapping bucket columns are listed first.
127+ std::tuple<std::vector<std::string>, int > generateSortColumns (
128+ int32_t maxNumColumns,
129+ const std::vector<std::string>& bucketColumns,
130+ std::vector<std::string>& names,
117131 std::vector<TypePtr>& types);
118132
119133 // Generates input data for table write.
@@ -342,12 +356,12 @@ void WriterFuzzer::go() {
342356 bucketCount =
343357 boost::random::uniform_int_distribution<int32_t >(1 , 3 )(rng_);
344358
345- // TODO: sort columns can overlap as bucket columns
346359 // 50% of times test ordered write.
347360 if (vectorFuzzer_.coinToss (0.5 )) {
348361 sortColumnOffset = names.size ();
349- auto sortColumns = generateColumns (
350- 3 , " s" , kSupportedSortColumnTypes_ , 1 , names, types);
362+ auto [sortColumns, offset] =
363+ generateSortColumns (3 , bucketColumns, names, types);
364+ sortColumnOffset -= offset;
351365 sortBy.reserve (sortColumns.size ());
352366 for (const auto & sortByColumn : sortColumns) {
353367 sortBy.push_back (std::make_shared<const HiveSortingColumn>(
@@ -392,11 +406,12 @@ std::vector<std::string> WriterFuzzer::generateColumns(
392406 const std::vector<TypePtr>& dataTypes,
393407 int32_t maxDepth,
394408 std::vector<std::string>& names,
395- std::vector<TypePtr>& types) {
409+ std::vector<TypePtr>& types,
410+ const int32_t offset) {
396411 const auto numColumns =
397412 boost::random::uniform_int_distribution<uint32_t >(1 , maxNumColumns)(rng_);
398413 std::vector<std::string> columns;
399- for (auto i = 0 ; i < numColumns; ++i) {
414+ for (auto i = offset ; i < numColumns; ++i) {
400415 columns.push_back (fmt::format (" {}{}" , prefix, i));
401416
402417 // Pick random, possibly complex, type.
@@ -406,6 +421,41 @@ std::vector<std::string> WriterFuzzer::generateColumns(
406421 return columns;
407422}
408423
// Picks the sort columns for an ordered write. Between 0 and
// min(maxNumColumns, bucketColumns.size()) trailing entries of 'bucketColumns'
// are reused as sort columns; generateColumns() is then asked for additional
// columns. Returns the combined column names (reused bucket columns first)
// together with the number of reused bucket columns, which the caller
// subtracts from its sort column offset.
424+ std::tuple<std::vector<std::string>, int > WriterFuzzer::generateSortColumns (
425+ int32_t maxNumColumns,
426+ const std::vector<std::string>& bucketColumns,
427+ std::vector<std::string>& names,
428+ std::vector<TypePtr>& types) {
429+ // Randomly choose how many sort columns overlap with the bucket columns,
430+ // which have already been generated.
431+ const auto maxOverlapColumns = std::min<int32_t >(
432+ maxNumColumns, static_cast <int32_t >(bucketColumns.size ()));
433+ const auto numOverlapColumns =
434+ static_cast <int32_t >(boost::random::uniform_int_distribution<uint32_t >(
435+ 0 , maxOverlapColumns)(rng_));
436+
// Reuse the last 'numOverlapColumns' bucket columns as the leading sort
// columns.
437+ std::vector<std::string> columns (
438+ bucketColumns.end () - numOverlapColumns, bucketColumns.end ());
439+
440+ // Sort columns that do not overlap with bucket columns are requested from
441+ // generateColumns() as new columns with prefix "s".
// NOTE(review): generateColumns() draws its count from [1, remainingColumns]
// and then iterates from 'offset' (= numOverlapColumns) up to that count, so
// it may add fewer than 'remainingColumns' new columns, possibly none.
// Confirm that subtracting the overlap in both places is intended.
442+ const auto remainingColumns = maxNumColumns - numOverlapColumns;
443+ if (remainingColumns > 0 ) {
444+ auto nonOverlapColumns = generateColumns (
445+ remainingColumns,
446+ " s" ,
447+ kSupportedSortColumnTypes_ ,
448+ 1 ,
449+ names,
450+ types,
451+ numOverlapColumns);
452+ columns.insert (
453+ columns.end (), nonOverlapColumns.begin (), nonOverlapColumns.end ());
454+ }
455+
456+ return {columns, numOverlapColumns};
457+ }
458+
409459std::vector<RowVectorPtr> WriterFuzzer::generateInputData (
410460 std::vector<std::string> names,
411461 std::vector<TypePtr> types,
0 commit comments