Skip to content

Commit 710d449

Browse files
HeidiHan0000 authored and facebook-github-bot committed
feat(fuzzer): Allow bucket columns to overlap as sort columns in writer fuzzer (facebookincubator#12007)
Summary: Allow bucket columns to overlap as sort columns by using some of the bucket columns as sort columns and generating the rest of the sort columns. Pull Request resolved: facebookincubator#12007 Reviewed By: kewang1024 Differential Revision: D67775105 Pulled By: HeidiHan0000 fbshipit-source-id: a2b772c9afa2c2d3317030ca20a829572d2cb9f8
1 parent e383017 commit 710d449

File tree

1 file changed

+55
-5
lines changed

1 file changed

+55
-5
lines changed

velox/exec/fuzzer/WriterFuzzer.cpp

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <boost/random/uniform_int_distribution.hpp>
1919

2020
#include <re2/re2.h>
21+
#include <algorithm>
2122
#include <unordered_set>
2223
#include "velox/common/base/Fs.h"
2324
#include "velox/common/encode/Base64.h"
@@ -108,12 +109,25 @@ class WriterFuzzer {
108109
// zero-based ordinal number of the column.
109110
// Data types are chosen from 'dataTypes' and, for nested complex data types,
110111
// maxDepth limits the max layers of nesting.
112+
// Offset represents the number of columns that have already been generated.
113+
// The function will generate the remaining columns starting from this index.
111114
std::vector<std::string> generateColumns(
112115
int32_t maxNumColumns,
113116
const std::string& prefix,
114117
const std::vector<TypePtr>& dataTypes,
115118
int32_t maxDepth,
116119
std::vector<std::string>& names,
120+
std::vector<TypePtr>& types,
121+
int32_t offset = 0);
122+
123+
// Generates at least one and up to maxNumColumns columns
124+
// with a random number of those columns overlapping with the bucket-by columns.
125+
// Returns the sort column names and the number of overlapped bucket columns (the index in the returned list where newly generated sort columns start).
126+
// The overlapped bucketed columns are listed first.
127+
std::tuple<std::vector<std::string>, int> generateSortColumns(
128+
int32_t maxNumColumns,
129+
const std::vector<std::string>& bucketColumns,
130+
std::vector<std::string>& names,
117131
std::vector<TypePtr>& types);
118132

119133
// Generates input data for table write.
@@ -342,12 +356,12 @@ void WriterFuzzer::go() {
342356
bucketCount =
343357
boost::random::uniform_int_distribution<int32_t>(1, 3)(rng_);
344358

345-
// TODO: sort columns can overlap as bucket columns
346359
// Test ordered write 50% of the time.
347360
if (vectorFuzzer_.coinToss(0.5)) {
348361
sortColumnOffset = names.size();
349-
auto sortColumns = generateColumns(
350-
3, "s", kSupportedSortColumnTypes_, 1, names, types);
362+
auto [sortColumns, offset] =
363+
generateSortColumns(3, bucketColumns, names, types);
364+
sortColumnOffset -= offset;
351365
sortBy.reserve(sortColumns.size());
352366
for (const auto& sortByColumn : sortColumns) {
353367
sortBy.push_back(std::make_shared<const HiveSortingColumn>(
@@ -392,11 +406,12 @@ std::vector<std::string> WriterFuzzer::generateColumns(
392406
const std::vector<TypePtr>& dataTypes,
393407
int32_t maxDepth,
394408
std::vector<std::string>& names,
395-
std::vector<TypePtr>& types) {
409+
std::vector<TypePtr>& types,
410+
const int32_t offset) {
396411
const auto numColumns =
397412
boost::random::uniform_int_distribution<uint32_t>(1, maxNumColumns)(rng_);
398413
std::vector<std::string> columns;
399-
for (auto i = 0; i < numColumns; ++i) {
414+
for (auto i = offset; i < numColumns; ++i) {
400415
columns.push_back(fmt::format("{}{}", prefix, i));
401416

402417
// Pick random, possibly complex, type.
@@ -406,6 +421,41 @@ std::vector<std::string> WriterFuzzer::generateColumns(
406421
return columns;
407422
}
408423

424+
// Builds the list of sort-by column names for an ordered bucketed write.
// A randomly sized suffix of 'bucketColumns' is reused as the leading sort
// columns; any further sort columns are created through generateColumns(),
// which appends to 'names' and 'types'.
//
// Returns a tuple of (sort column names, number of reused bucket columns).
// The reused bucket columns come first in the returned list, so the second
// element is also the index at which newly generated columns begin.
std::tuple<std::vector<std::string>, int> WriterFuzzer::generateSortColumns(
425+
int32_t maxNumColumns,
426+
const std::vector<std::string>& bucketColumns,
427+
std::vector<std::string>& names,
428+
std::vector<TypePtr>& types) {
429+
// Pick how many sort columns reuse already-generated bucket columns: a random
430+
// count in [0, min(maxNumColumns, bucketColumns.size())].
431+
const auto maxOverlapColumns = std::min<int32_t>(
432+
maxNumColumns, static_cast<int32_t>(bucketColumns.size()));
433+
const auto numOverlapColumns =
434+
static_cast<int32_t>(boost::random::uniform_int_distribution<uint32_t>(
435+
0, maxOverlapColumns)(rng_));
436+
437+
// Seed the result with the last 'numOverlapColumns' entries of bucketColumns.
std::vector<std::string> columns(
438+
bucketColumns.end() - numOverlapColumns, bucketColumns.end());
439+
440+
// Any remaining sort columns that do not overlap with the bucket-by columns
441+
// are added as new columns with prefix "s".
// NOTE(review): generateColumns() starts its name index at the offset passed
// here and stops at a random count in [1, remainingColumns], so it may add
// fewer new columns than 'remainingColumns', possibly none.
442+
const auto remainingColumns = maxNumColumns - numOverlapColumns;
443+
if (remainingColumns > 0) {
444+
auto nonOverlapColumns = generateColumns(
445+
remainingColumns,
446+
"s",
447+
kSupportedSortColumnTypes_,
448+
1,
449+
names,
450+
types,
451+
numOverlapColumns);
452+
columns.insert(
453+
columns.end(), nonOverlapColumns.begin(), nonOverlapColumns.end());
454+
}
455+
456+
return {columns, numOverlapColumns};
457+
}
458+
409459
std::vector<RowVectorPtr> WriterFuzzer::generateInputData(
410460
std::vector<std::string> names,
411461
std::vector<TypePtr> types,

0 commit comments

Comments
 (0)