diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000000..5b36ac93d48d
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,92 @@
+Checks: >
+  *,
+  -abseil-*,
+  -android-*,
+  -cert-err58-cpp,
+  -clang-analyzer-osx-*,
+  -cppcoreguidelines-avoid-c-arrays,
+  -cppcoreguidelines-avoid-goto,
+  -cppcoreguidelines-avoid-magic-numbers,
+  -cppcoreguidelines-avoid-non-const-global-variables,
+  -cppcoreguidelines-owning-memory,
+  -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
+  -cppcoreguidelines-pro-bounds-pointer-arithmetic,
+  -cppcoreguidelines-pro-type-reinterpret-cast,
+  -cppcoreguidelines-pro-type-vararg,
+  -cppcoreguidelines-special-member-functions,
+  -fuchsia-*,
+  -google-*,
+  -hicpp-avoid-c-arrays,
+  -hicpp-avoid-goto,
+  -hicpp-deprecated-headers,
+  -hicpp-no-array-decay,
+  -hicpp-special-member-functions,
+  -hicpp-use-equals-default,
+  -hicpp-vararg,
+  -llvm-header-guard,
+  -llvm-include-order,
+  -llvmlibc-*,
+  -misc-no-recursion,
+  -misc-non-private-member-variables-in-classes,
+  -misc-unused-parameters,
+  -modernize-avoid-c-arrays,
+  -modernize-deprecated-headers,
+  -modernize-use-nodiscard,
+  -modernize-use-trailing-return-type,
+  -mpi-*,
+  -objc-*,
+  -openmp-*,
+  -readability-avoid-const-params-in-decls,
+  -readability-convert-member-functions-to-static,
+  -readability-implicit-bool-conversion,
+  -readability-magic-numbers,
+  -zircon-*,
+
+HeaderFilterRegex: '.*'
+
+WarningsAsErrors: ''
+
+CheckOptions:
+  # Naming conventions as explicitly stated in CODING_STYLE.md
+  - key: readability-identifier-naming.ClassCase
+    value: CamelCase
+  - key: readability-identifier-naming.StructCase
+    value: CamelCase
+  - key: readability-identifier-naming.EnumCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypeAliasCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypeTemplateParameterCase
+    value: CamelCase
+  - key: readability-identifier-naming.FunctionCase
+    value: camelBack
+  - key: readability-identifier-naming.VariableCase
+    value: camelBack
+  - key: readability-identifier-naming.ParameterCase
+    value: camelBack
+  - key: readability-identifier-naming.PrivateMemberCase
+    value: camelBack
+  - key: readability-identifier-naming.PrivateMemberSuffix
+    value: _
+  - key: readability-identifier-naming.ProtectedMemberCase
+    value: camelBack
+  - key: readability-identifier-naming.ProtectedMemberSuffix
+    value: _
+  - key: readability-identifier-naming.MacroDefinitionCase
+    value: UPPER_CASE
+  - key: readability-identifier-naming.NamespaceCase
+    value: lower_case
+  - key: readability-identifier-naming.StaticConstantPrefix
+    value: k
+  - key: readability-identifier-naming.EnumConstantCase
+    value: CamelCase
+  - key: readability-identifier-naming.EnumConstantPrefix
+    value: k
+
+  # Use nullptr instead of NULL or 0
+  - key: modernize-use-nullptr.NullMacros
+    value: 'NULL'
+
+  # modernize-use-using option: skip typedefs of std::allocator when converting to using
+  - key: modernize-use-using.IgnoreUsingStdAllocator
+    value: 1
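As a quick illustration of the `readability-identifier-naming` options above, here is a small, invented C++ sketch; every identifier in it is hypothetical, and only the casing, prefixes, and suffixes matter.

```cpp
// All identifiers below are invented purely to illustrate the configured
// naming rules; only the shapes of the names matter.
#define VELOX_EXAMPLE_ENABLED 1 // MacroDefinitionCase: UPPER_CASE

namespace facebook::velox::example { // NamespaceCase: lower_case

static const int kMaxRetries = 3; // StaticConstantPrefix: k

enum class JoinKind { kInner, kLeft }; // EnumCase: CamelCase; constants: k + CamelCase

template <typename InputType> // TypeTemplateParameterCase: CamelCase
class RowBuffer { // ClassCase: CamelCase
 public:
  using RowIndex = int; // TypeAliasCase: CamelCase

  void appendRow(RowIndex rowIndex); // FunctionCase and ParameterCase: camelBack

 protected:
  int capacity_; // ProtectedMemberCase: camelBack with '_' suffix

 private:
  int numRows_; // PrivateMemberCase: camelBack with '_' suffix
};

} // namespace facebook::velox::example
```

Note that `readability-identifier-naming` is not in the exclusion list above, so these rules take effect whenever clang-tidy runs with this configuration.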
diff --git a/.cmake-format.yaml b/.cmake-format.yaml
index 91c373bf6b3b..bbbd89f433a5 100644
--- a/.cmake-format.yaml
+++ b/.cmake-format.yaml
@@ -24,46 +24,46 @@ format:
   separate_ctrl_name_with_space: false
   separate_fn_name_with_space: false
   dangle_parens: false
-  command_case: "canonical"
-  keyword_case: "unchanged"
+  command_case: canonical
+  keyword_case: unchanged
   always_wrap:
     - set_target_properties
     - target_sources
     - target_link_libraries
 parse:
   # We define these for our custom
   # functions so they get formatted correctly
   additional_commands:
     velox_add_library:
       pargs:
         nargs: 1+
       flags:
-      - OBJECT
-      - STATIC
-      - SHARED
-      - INTERFACE
+        - OBJECT
+        - STATIC
+        - SHARED
+        - INTERFACE
       kwargs: {}
     velox_base_add_library:
       pargs:
         nargs: 1+
       flags:
-      - OBJECT
-      - STATIC
-      - SHARED
-      - INTERFACE
+        - OBJECT
+        - STATIC
+        - SHARED
+        - INTERFACE
       kwargs: {}
     velox_compile_definitions:
-      pargs: 1
+      pargs: 1
       kwargs:
         PRIVATE: '*'
         PUBLIC: '*'
         INTERFACE: '*'
     velox_include_directories:
-      pargs: '1+'
+      pargs: 1+
       flags:
         - SYSTEM
         - BEFORE
@@ -74,11 +74,10 @@ parse:
         INTERFACE: '*'
     velox_link_libraries:
-      pargs: '1+'
+      pargs: 1+
       kwargs:
         PRIVATE: '*'
         PUBLIC: '*'
         INTERFACE: '*'
-
 markup:
   first_comment_is_literal: true
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 30d7ee15cd85..223009a6f866 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -16,7 +16,7 @@
 # request a review from owners on PRs with changes to matching files.
 # We currently do not enforce these reviews as required so it's only a tool
 # for more granular notifications at the moment. For example component maintainers
-# can set a rule so that they are pinged on changes to the sections of the
+# can set a rule so that they are pinged on changes to the sections of the
 # codebase that are relevant for their component.
 
 # Only users that have write access to the repo can be added as owners.
@@ -29,7 +29,7 @@ CMake/ @assignUser @majetideepak
 scripts/ @assignUser @majetideepak
 .github/ @assignUser @majetideepak
 
-# Breeze
+# Breeze
 velox/experimental/breeze @dreveman
 
 # cuDF
@@ -45,4 +45,4 @@ velox/connectors/hive/storage_adapters/ @majetideepak
 velox/connectors/ @majetideepak
 
 # Caching
-velox/common/caching/ @majetideepak
+velox/common/caching/ @majetideepak
diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
index 9fc52567b5bb..0fd30969d329 100644
--- a/.github/ISSUE_TEMPLATE/bug.yml
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@@ -14,7 +14,7 @@
 
 name: Bug Report
 description: Report a bug or unexpected behavior.
-labels: ["bug", "triage"]
+labels: [bug, triage]
 body:
   - type: markdown
     attributes:
@@ -26,7 +26,7 @@ body:
     attributes:
       label: Bug description
       description: Please describe the issue and the expected behavior.
-      value: "[Expected behavior] and [actual behavior]."
+      value: '[Expected behavior] and [actual behavior].'
     validations:
       required: true
   - type: textarea
diff --git a/.github/ISSUE_TEMPLATE/build.yml b/.github/ISSUE_TEMPLATE/build.yml
index f02dbc8509dc..9b560541c9e2 100644
--- a/.github/ISSUE_TEMPLATE/build.yml
+++ b/.github/ISSUE_TEMPLATE/build.yml
@@ -14,7 +14,7 @@
 
 name: Build problem
 description: Report an issue when building Velox.
-labels: ["build", "triage"]
+labels: [build, triage]
 body:
   - type: markdown
     attributes:
@@ -26,7 +26,7 @@ body:
     attributes:
       label: Problem description
       description: Please describe the problem.
-      value: "Please describe how you were trying to build velox and what issue occured"
+      value: Please describe how you were trying to build Velox and what issue occurred
     validations:
       required: true
   - type: textarea
diff --git a/.github/ISSUE_TEMPLATE/enhancement.yml b/.github/ISSUE_TEMPLATE/enhancement.yml
index 89cd7ac5864f..dcad3a4068d3 100644
--- a/.github/ISSUE_TEMPLATE/enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/enhancement.yml
@@ -14,7 +14,7 @@
 
 name: Enhancement
 description: Raise a potential enhancement.
-labels: ["enhancement"] +labels: [enhancement] body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/fuzzer.yml b/.github/ISSUE_TEMPLATE/fuzzer.yml index 8572fcfcad86..8275ae049a23 100644 --- a/.github/ISSUE_TEMPLATE/fuzzer.yml +++ b/.github/ISSUE_TEMPLATE/fuzzer.yml @@ -14,7 +14,7 @@ name: Fuzzer Report description: Report an issue with the fuzzer or found through fuzzing. -labels: ["bug", "fuzzer-found", "fuzzer"] +labels: [bug, fuzzer-found, fuzzer] body: - type: markdown attributes: @@ -26,7 +26,7 @@ body: attributes: label: Description description: Please describe the issue. - placeholder: "[Expected behavior] and [actual behavior]." + placeholder: '[Expected behavior] and [actual behavior].' validations: required: true - type: textarea diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7adedbe9a4bd..e00dd70cc9a3 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -14,9 +14,9 @@ version: 2 updates: - - package-ecosystem: "github-actions" - directory: "/" + - package-ecosystem: github-actions + directory: / schedule: - interval: "weekly" + interval: weekly commit-message: - prefix: "build(ci): " + prefix: 'build(ci): ' diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 53d2b202700f..a641fd5311f9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: "Ubuntu Benchmark" +name: Ubuntu Benchmark on: pull_request: paths: - - '.github/workflows/benchmark.yml' - - 'scripts/ci/benchmark-requirements.txt' - - 'scripts/ci/setup-ubuntu.sh' + - .github/workflows/benchmark.yml + - scripts/ci/benchmark-requirements.txt + - scripts/setup-ubuntu.sh push: branches: [main] @@ -40,58 +40,51 @@ jobs: if: github.repository == 'facebookincubator/velox' runs-on: 8-core-ubuntu-22.04 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" - BINARY_DIR: "${{ github.workspace }}/benchmarks/" - CONTENDER_OUTPUT_PATH: "${{ github.workspace }}/benchmark-results/contender/" - INSTALL_PREFIX: "${{ github.workspace }}/dependencies" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + CCACHE_BASEDIR: ${{ github.workspace }} + BINARY_DIR: ${{ github.workspace }}/benchmarks/ + CONTENDER_OUTPUT_PATH: ${{ github.workspace }}/benchmark-results/contender/ + INSTALL_PREFIX: ${{ github.workspace }}/dependencies steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - path: 'velox' - submodules: 'recursive' + path: velox + persist-credentials: false - name: Restore Dependencies uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 id: restore-deps with: - path: "${{ env.INSTALL_PREFIX }}" + path: ${{ env.INSTALL_PREFIX }} key: dependencies-benchmark-${{ hashFiles('velox/scripts/setup-ubuntu.sh') }} - name: Install apt dependencies run: | - source velox/scripts/setup-ubuntu.sh - install_apt_deps + source velox/scripts/setup-ubuntu.sh + install_apt_deps - name: Install compiled dependencies if: ${{ steps.restore-deps.outputs.cache-hit != 'true' }} env: - CCACHE_DISABLE: "true" - run: | - source velox/scripts/setup-ubuntu.sh - run_and_time install_fmt - run_and_time install_protobuf - run_and_time install_boost - run_and_time install_fast_float - run_and_time install_folly - run_and_time install_stemmer - run_and_time install_thrift - run_and_time install_arrow - - # Folly is built with a new flag that 
the cache may not have. Ensure velox build and folly have
-      # used the correct flags (disabled the coroutines)
-      - name: 'TEMP: Force folly rebuild and update cache'
+          CCACHE_DISABLE: 'true'
         run: |
-          source velox/scripts/setup-ubuntu.sh
-          run_and_time install_folly
+          source velox/scripts/setup-ubuntu.sh
+          run_and_time install_fmt
+          run_and_time install_protobuf
+          run_and_time install_boost
+          run_and_time install_fast_float
+          run_and_time install_folly
+          run_and_time install_stemmer
+          run_and_time install_thrift
+          run_and_time install_arrow
 
       - name: Save Dependencies
         if: ${{ steps.restore-deps.outputs.cache-hit != 'true' }}
         uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
         with:
-          path: "${{ env.INSTALL_PREFIX }}"
+          path: ${{ env.INSTALL_PREFIX }}
           key: dependencies-benchmark-${{ hashFiles('velox/scripts/setup-ubuntu.sh') }}
 
       - name: Restore ccache
@@ -99,7 +92,7 @@ jobs:
         id: restore-cache
         with:
           clean: true
-          path: "ccache"
+          path: ccache
           key: ccache-benchmark
 
       - name: Clear CCache Statistics
@@ -109,10 +102,10 @@
       - name: Build Contender Benchmarks
         working-directory: velox
         run: |
-          n_cores=$(nproc)
-          make benchmarks-basic-build NUM_THREADS=$n_cores MAX_HIGH_MEM_JOBS=$n_cores MAX_LINK_JOBS=$n_cores
-          mkdir -p ${BINARY_DIR}/contender/
-          cp -r --verbose _build/release/velox/benchmarks/basic/* ${BINARY_DIR}/contender/
+          n_cores=$(nproc)
+          make benchmarks-basic-build NUM_THREADS=$n_cores MAX_HIGH_MEM_JOBS=$n_cores MAX_LINK_JOBS=$n_cores
+          mkdir -p ${BINARY_DIR}/contender/
+          cp -r --verbose _build/release/velox/benchmarks/basic/* ${BINARY_DIR}/contender/
 
       - name: CCache after
         run: |
@@ -121,7 +114,7 @@
       - name: Save ccache
         uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4
         with:
-          path: "ccache"
+          path: ccache
           key: ccache-benchmark
 
       - name: Install benchmark dependencies
         run: |
           python3 -m pip install -r velox/scripts/ci/benchmark-requirements.txt
 
       - name: Run Benchmarks - Contender
-        working-directory: 'velox'
+        working-directory: velox
         run: |
-          make benchmarks-basic-run \
-            EXTRA_BENCHMARK_FLAGS="--binary_path ${BINARY_DIR}/contender/ --output_path ${CONTENDER_OUTPUT_PATH}"
+          make benchmarks-basic-run \
+            EXTRA_BENCHMARK_FLAGS="--binary_path ${BINARY_DIR}/contender/ --output_path ${CONTENDER_OUTPUT_PATH}"
 
       - name: Upload result artifact
         uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
         with:
-          path: "benchmark-results"
-          name: "benchmark-results"
+          path: benchmark-results
+          name: benchmark-results
           retention-days: 5
 
   upload:
@@ -150,80 +143,80 @@
       statuses: write
     steps:
-      - name: Download artifacts
-        uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
-        with:
-          merge-multiple: true
-          path: /tmp/artifacts/
-
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          path: velox
-          persist-credentials: false
-
-      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
-        with:
-          python-version: '3.10'
-          cache: 'pip'
-          cache-dependency-path: "velox/scripts/*"
-
-      - name: Install dependencies
-        run: pip install -r velox/scripts/ci/benchmark-requirements.txt
-
-      - name: Upload results
-        env:
-          CONBENCH_URL: "https://velox-conbench.voltrondata.run/"
-          CONBENCH_MACHINE_INFO_NAME: "GitHub-runner-8-core"
-          CONBENCH_EMAIL: "${{ secrets.CONBENCH_EMAIL }}"
-          CONBENCH_PASSWORD: "${{ secrets.CONBENCH_PASSWORD }}"
-          CONBENCH_PROJECT_REPOSITORY: "${{ github.repository }}"
-          CONBENCH_PROJECT_COMMIT: "${{ github.sha 
}}" - RUN_ID: "GHA-${{ github.run_id }}-${{ github.run_attempt }}" - run: | - ./velox/scripts/ci/benchmark-runner.py upload \ - --run_id "$RUN_ID" \ - --sha "$CONBENCH_PROJECT_COMMIT" \ - --output_dir "/tmp/artifacts/contender/" - - - name: Check the status of the upload - # Status functions like failure() only work in `if:` - if: failure() - id: status - run: echo "failed=true" >> $GITHUB_OUTPUT - - - name: Create a GitHub Status on the contender commit (whether the upload was successful) - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - if: ${{ !cancelled() && steps.extract.conclusion != 'failure' }} - with: - script: | - let url = 'https://github.com/${{github.repository}}/actions/runs/${{ github.run_id }}' - let state = 'success' - let description = 'Result upload succeeded!' - - if(${{ steps.status.outputs.failed == 'true' && true || false }}) { - state = 'failure' - description = 'Result upload failed!' - } - - github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: context.sha, - state: state, - target_url: url, - description: description, - context: 'Benchmark Result Upload' - }) - - - name: Create a GitHub Check benchmark report on the merged PR - env: - CONBENCH_URL: "https://velox-conbench.voltrondata.run/" - GITHUB_APP_ID: "${{ secrets.GH_APP_ID }}" - GITHUB_APP_PRIVATE_KEY: "${{ secrets.GH_APP_PRIVATE_KEY }}" - COMMIT_MESSAGE: "${{ github.event.head_commit.message }}" - CONTENDER_SHA: "${{ github.sha }}" - run: | - ./velox/scripts/ci/benchmark-alert.py \ - --contender-sha "$CONTENDER_SHA" \ - --merge-commit-message "$COMMIT_MESSAGE" \ - --z-score-threshold 50 + - name: Download artifacts + uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9 + with: + merge-multiple: true + path: /tmp/artifacts/ + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: velox + persist-credentials: false + + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.10' + cache: pip + cache-dependency-path: velox/scripts/* + + - name: Install dependencies + run: pip install -r velox/scripts/ci/benchmark-requirements.txt + + - name: Upload results + env: + CONBENCH_URL: https://velox-conbench.voltrondata.run/ + CONBENCH_MACHINE_INFO_NAME: GitHub-runner-8-core + CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} + CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASSWORD }} + CONBENCH_PROJECT_REPOSITORY: ${{ github.repository }} + CONBENCH_PROJECT_COMMIT: ${{ github.sha }} + RUN_ID: GHA-${{ github.run_id }}-${{ github.run_attempt }} + run: | + ./velox/scripts/ci/benchmark-runner.py upload \ + --run_id "$RUN_ID" \ + --sha "$CONBENCH_PROJECT_COMMIT" \ + --output_dir "/tmp/artifacts/contender/" + + - name: Check the status of the upload + # Status functions like failure() only work in `if:` + if: failure() + id: status + run: echo "failed=true" >> $GITHUB_OUTPUT + + - name: Create a GitHub Status on the contender commit (whether the upload was successful) + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ !cancelled() && steps.extract.conclusion != 'failure' }} + with: + script: | + let url = 'https://github.com/${{github.repository}}/actions/runs/${{ github.run_id }}' + let state = 'success' + let description = 'Result upload succeeded!' + + if(${{ steps.status.outputs.failed == 'true' && true || false }}) { + state = 'failure' + description = 'Result upload failed!' 
+ } + + github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: context.sha, + state: state, + target_url: url, + description: description, + context: 'Benchmark Result Upload' + }) + + - name: Create a GitHub Check benchmark report on the merged PR + env: + CONBENCH_URL: https://velox-conbench.voltrondata.run/ + GITHUB_APP_ID: ${{ secrets.GH_APP_ID }} + GITHUB_APP_PRIVATE_KEY: ${{ secrets.GH_APP_PRIVATE_KEY }} + COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + CONTENDER_SHA: ${{ github.sha }} + run: | + ./velox/scripts/ci/benchmark-alert.py \ + --contender-sha "$CONTENDER_SHA" \ + --merge-commit-message "$COMMIT_MESSAGE" \ + --z-score-threshold 50 diff --git a/.github/workflows/breeze.yml b/.github/workflows/breeze.yml index 0d778f6cac86..57db00d9458e 100644 --- a/.github/workflows/breeze.yml +++ b/.github/workflows/breeze.yml @@ -17,23 +17,23 @@ name: Breeze Linux Build on: push: branches: - - "main" + - main paths: - - "velox/experimental/breeze/**" - - "velox/external/perfetto/**" - - "CMake/**" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/breeze.yml" + - velox/experimental/breeze/** + - velox/external/perfetto/** + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-helper-functions.sh + - .github/workflows/breeze.yml pull_request: paths: - - "velox/experimental/breeze/**" - - "velox/external/perfetto/**" - - "CMake/**" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/breeze.yml" + - velox/experimental/breeze/** + - velox/external/perfetto/** + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-helper-functions.sh + - .github/workflows/breeze.yml permissions: contents: read @@ -47,7 +47,7 @@ jobs: runs-on: ubuntu-22.04 # prevent errors when forks ff their main branch if: ${{ github.repository == 'facebookincubator/velox' }} - name: "Ubuntu debug" + name: Ubuntu debug defaults: run: shell: bash @@ -57,6 +57,7 @@ jobs: - uses: actions/checkout@v4 with: path: velox + persist-credentials: false - name: Install Dependencies run: | @@ -82,9 +83,9 @@ jobs: runs-on: 4-core-ubuntu-gpu-t4 # prevent errors when forks ff their main branch if: ${{ github.repository == 'facebookincubator/velox' }} - name: "Ubuntu GPU debug" + name: Ubuntu GPU debug env: - CUDA_VERSION: "12.2" + CUDA_VERSION: '12.2' defaults: run: shell: bash @@ -94,6 +95,7 @@ jobs: - uses: actions/checkout@v4 with: path: velox + persist-credentials: false - name: Install Dependencies run: | diff --git a/.github/workflows/build-metrics.yml b/.github/workflows/build-metrics.yml index 3677fd20ad96..53c775962da8 100644 --- a/.github/workflows/build-metrics.yml +++ b/.github/workflows/build-metrics.yml @@ -16,19 +16,19 @@ name: Collect Build Metrics on: pull_request: - paths: - - ".github/workflows/build-metrics.yml" - - "scripts/ci/bm-report/**" + paths: + - .github/workflows/build-metrics.yml + - scripts/ci/bm-report/* workflow_dispatch: inputs: ref: - description: "ref to check" + description: ref to check required: true schedule: # Run every day at 04:05 - - cron: "5 4 * * *" + - cron: 5 4 * * * permissions: contents: read @@ -42,9 +42,9 @@ jobs: strategy: fail-fast: false matrix: - runner: ["16-core-ubuntu"] - type: ["debug", "release"] - link-type: ["shared", "static"] + runner: [16-core-ubuntu] + type: [debug, release] + link-type: [shared, static] defaults: run: shell: bash @@ -52,6 +52,7 @@ jobs: - uses: actions/checkout@v4 with: ref: ${{ inputs.ref || github.sha }} + 
persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -60,7 +61,7 @@ jobs: - name: Make ${{ matrix.link-type }} - ${{ matrix.type }} Build env: - MAKEFLAGS: 'MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=4' + MAKEFLAGS: MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=4 run: | EXTRA_CMAKE_FLAGS=( "-DVELOX_ENABLE_BENCHMARKS=ON" @@ -103,34 +104,35 @@ jobs: - uses: actions/upload-artifact@v4 with: path: ${{ env.sizes_file }} - name: "${{ matrix.type }}-${{ matrix.link-type }}-sizes" + name: ${{ matrix.type }}-${{ matrix.link-type }}-sizes - name: Copy ninja_log run: cp _build/${{ matrix.type }}/.ninja_log /tmp/metrics/.ninja_log - - name: "Install dependencies" + - name: Install dependencies run: | python3 -m pip install setuptools python3 -m pip install -r scripts/ci/benchmark-requirements.txt - - name: "Upload Metrics" + - name: Upload Metrics # This disables the upload and report generation on fork PRs but allows it for forks from within the main repo. if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == 'facebookincubator/velox' }} env: - CONBENCH_URL: "https://velox-conbench.voltrondata.run/" - CONBENCH_MACHINE_INFO_NAME: "GitHub-runner-${{ matrix.runner }}" - CONBENCH_EMAIL: "${{ secrets.CONBENCH_EMAIL }}" - CONBENCH_PASSWORD: "${{ secrets.CONBENCH_PASSWORD }}" + CONBENCH_URL: https://velox-conbench.voltrondata.run/ + CONBENCH_MACHINE_INFO_NAME: GitHub-runner-${{ matrix.runner }} + CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} + CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASSWORD }} # These don't actually work https://github.com/conbench/conbench/issues/1484 # but have to be there to work regardless?? - CONBENCH_PROJECT_REPOSITORY: "${{ github.repository }}" - CONBENCH_PROJECT_COMMIT: "${{ inputs.ref || github.sha }}" + CONBENCH_PROJECT_REPOSITORY: ${{ github.repository }} + CONBENCH_PROJECT_COMMIT: ${{ inputs.ref || github.sha }} + REF: ${{ inputs.ref || github.sha }} run: | ./scripts/ci/bm-report/build-metrics.py upload \ --build_type "${{ matrix.link-type }}-${{ matrix.type }}" \ --run_id "BM-${{ matrix.link-type }}-${{ matrix.type }}-${{ github.run_id }}-${{ github.run_attempt }}" \ --pr_number "${{ github.event.number }}" \ - --sha "${{ inputs.ref || github.sha }}" \ + --sha "$REF" \ "/tmp/metrics" upload-report: @@ -144,6 +146,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: fetch-depth: 0 + persist-credentials: true - name: Setup Git User run: | @@ -161,9 +164,9 @@ jobs: - name: Build Documentation env: - CONBENCH_URL: "https://velox-conbench.voltrondata.run/" - CONBENCH_EMAIL: "${{ secrets.CONBENCH_EMAIL }}" - CONBENCH_PASSWORD: "${{ secrets.CONBENCH_PASSWORD }}" + CONBENCH_URL: https://velox-conbench.voltrondata.run/ + CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} + CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASSWORD }} run: | cd scripts/ci/bm-report nix-shell --run "quarto render report.qmd" diff --git a/.github/workflows/build_pyvelox.yml b/.github/workflows/build_pyvelox.yml index e92f74ef929f..cf54e5087ce7 100644 --- a/.github/workflows/build_pyvelox.yml +++ b/.github/workflows/build_pyvelox.yml @@ -18,13 +18,13 @@ on: workflow_dispatch: inputs: version: - description: 'pyvelox version' + description: pyvelox version required: false ref: - description: 'git ref to build' + description: git ref to build required: false publish: - description: 'publish to PyPI' + description: publish to PyPI required: false type: boolean default: false @@ -32,7 +32,7 @@ on: 
# - cron: '15 0 * * *'
   pull_request:
     paths:
-      - '.github/workflows/build_pyvelox.yml'
+      - .github/workflows/build_pyvelox.yml
 
 permissions:
   contents: read
@@ -54,61 +54,62 @@
         with:
           ref: ${{ inputs.ref || github.ref }}
           fetch-depth: 0
-          submodules: recursive
+          persist-credentials: false
 
       - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: '3.10'
 
-      - name: "Restore ccache"
+      - name: Restore ccache
         uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4
         id: restore-cache
         with:
-          path: "ccache"
+          path: ccache
           key: ccache-wheels-${{ matrix.os }}
 
-      - name: "Restore macOS Dependencies"
+      - name: Restore macOS Dependencies
         if: startsWith(matrix.os, 'macos')
         uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4
         id: restore-deps
         with:
-          path: "dependencies"
+          path: dependencies
           key: dependencies-pyvelox-${{ matrix.os }}-${{ hashFiles('scripts/setup-macos.sh') }}
 
       - name: Install macOS dependencies from brew
         if: startsWith(matrix.os, 'macos')
         run: |
-          export INSTALL_PREFIX="$GITHUB_WORKSPACE/dependencies"
-          echo "CMAKE_PREFIX_PATH=$INSTALL_PREFIX" >> $GITHUB_ENV
-          echo "DYLD_LIBRARY_PATH=$INSTALL_PREFIX/lib:$DYLD_LIBRARY_PATH" >> $GITHUB_ENV
-
-          source scripts/setup-macos.sh
-          install_build_prerequisites
-          install_velox_deps_from_brew
-          # CMake 4.0 causes issues with arrows bundled dependencies
-          brew uninstall cmake
-          pipx install --force cmake==3.31
+          export INSTALL_PREFIX="$GITHUB_WORKSPACE/dependencies"
+          echo "CMAKE_PREFIX_PATH=$INSTALL_PREFIX" >> $GITHUB_ENV
+          echo "INSTALL_PREFIX=$INSTALL_PREFIX" >> $GITHUB_ENV
+          echo "DYLD_LIBRARY_PATH=$INSTALL_PREFIX/lib:$DYLD_LIBRARY_PATH" >> $GITHUB_ENV
+
+          source scripts/setup-macos.sh
+          install_build_prerequisites
+          install_velox_deps_from_brew
+          # CMake 4.0 causes issues with Arrow's bundled dependencies
+          brew uninstall cmake
+          pipx install --force cmake==3.31
 
       - name: Install macOS dependencies
         if: ${{ startsWith(matrix.os, 'macos') && steps.restore-deps.outputs.stash-hit != 'true' }}
         env:
-          MACOSX_DEPLOYMENT_TARGET: "13.0"
+          MACOSX_DEPLOYMENT_TARGET: '13.0'
         run: |
           bash scripts/setup-macos.sh
 
-      - name: "Save macOS Dependencies"
+      - name: Save macOS Dependencies
         if: ${{ startsWith(matrix.os, 'macos') && steps.restore-deps.outputs.stash-hit != 'true' }}
         uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4
         with:
-          path: "dependencies"
+          path: dependencies
           key: dependencies-pyvelox-${{ matrix.os }}-${{ hashFiles('scripts/setup-macos.sh') }}
           retention-days: 90
 
-      - name: "Create sdist"
+      - name: Create sdist
         if: matrix.os == '8-core-ubuntu'
         env:
-          BUILD_VERSION: "${{ inputs.version }}"
+          BUILD_VERSION: ${{ inputs.version }}
         run: |
           pipx install uv
           uv build --sdist --out-dir wheelhouse
 
       - name: Build wheels
         uses: pypa/cibuildwheel@faf86a6ed7efa889faf6996aa23820831055001a # v2
         env:
-          BUILD_VERSION: "${{ inputs.version }}"
-          CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*"
-          CIBW_ENVIRONMENT_PASS_LINUX: "BUILD_VERSION"
-          CIBW_ENVIRONMENT_LINUX: "CCACHE_DIR=/host${{ github.workspace }}/ccache"
+          BUILD_VERSION: ${{ inputs.version }}
+          CIBW_BUILD: cp310-* cp311-* cp312-* cp313-*
+          CIBW_ENVIRONMENT_PASS_LINUX: BUILD_VERSION
+          CIBW_ENVIRONMENT_LINUX: CCACHE_DIR=/host${{ github.workspace }}/ccache
           # for macos
           MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os == 'macos-14' && '14' || '13' }}
-          CCACHE_DIR: "${{ github.workspace }}/ccache"
+          CCACHE_DIR: ${{ github.workspace }}/ccache
         with:
           output-dir: 
wheelhouse - - name: "Save ccache" + - name: Save ccache uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: "ccache" + path: ccache key: ccache-wheels-${{ matrix.os }} - - name: "Rename wheel compatibility tag" + - name: Rename wheel compatibility tag if: false #startsWith(matrix.os, 'macos') run: | brew install rename @@ -163,7 +164,7 @@ jobs: - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: "3.10" + python-version: '3.10' - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 7a7202498ea2..f8e86d321c25 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -16,7 +16,6 @@ name: Build & Push Docker Images on: pull_request: paths: - - scripts/docker/*.dockfile - scripts/docker/*.dockerfile - scripts/setup-*.sh - .github/workflows/docker.yml @@ -24,7 +23,6 @@ on: push: branches: [main] paths: - - scripts/docker/*.dockfile - scripts/docker/*.dockerfile - scripts/setup-*.sh - .github/workflows/docker.yml @@ -35,90 +33,90 @@ concurrency: permissions: contents: read - packages: write jobs: linux: - name: "Build and Push ${{ matrix.name }}" + name: Build and Push ${{ matrix.name }} runs-on: ubuntu-latest + permissions: + packages: write strategy: fail-fast: false matrix: include: - - name: Check - file: "scripts/docker/check-container.dockfile" - tags: "ghcr.io/facebookincubator/velox-dev:check" - name: Centos 9 - file: "scripts/docker/centos.dockerfile" - tags: "ghcr.io/facebookincubator/velox-dev:centos9" - - name: Pyvelox - file: "scripts/docker/pyvelox.dockerfile" - tags: "ghcr.io/facebookincubator/velox-dev:pyvelox" + file: scripts/docker/centos.dockerfile + tags: ghcr.io/facebookincubator/velox-dev:centos9 + - name: Pyvelox + file: scripts/docker/pyvelox.dockerfile + tags: ghcr.io/facebookincubator/velox-dev:pyvelox - name: Dev - file: "scripts/docker/ubuntu-22.04-cpp.dockerfile" - args: "" - tags: "ghcr.io/facebookincubator/velox-dev:ubuntu-22.04" + file: scripts/docker/ubuntu-22.04-cpp.dockerfile + args: '' + tags: ghcr.io/facebookincubator/velox-dev:ubuntu-22.04 steps: - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # v2.2.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 - name: Build and Push - uses: docker/build-push-action@v6 + uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0 # v6.17.0 with: - file: "${{ matrix.file }}" - build-args: "${{ matrix.args }}" + file: ${{ matrix.file }} + build-args: ${{ matrix.args }} push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: "${{ matrix.tags }}" + tags: ${{ matrix.tags }} linux-needs: - name: "Build and Push ${{ matrix.name }}" + name: Build and Push ${{ matrix.name }} needs: linux runs-on: ubuntu-latest + permissions: + packages: write strategy: fail-fast: false matrix: include: - name: Adapters - file: "scripts/docker/adapters.dockerfile" - tags: 
"ghcr.io/facebookincubator/velox-dev:adapters" + file: scripts/docker/adapters.dockerfile + tags: ghcr.io/facebookincubator/velox-dev:adapters - name: Presto Java - file: "scripts/docker/prestojava-container.dockerfile" - args: "PRESTO_VERSION=0.290" - tags: "ghcr.io/facebookincubator/velox-dev:presto-java" + file: scripts/docker/prestojava-container.dockerfile + args: PRESTO_VERSION=0.290 + tags: ghcr.io/facebookincubator/velox-dev:presto-java - name: Spark server - file: "scripts/docker/spark-container.dockerfile" - args: "SPARK_VERSION=3.5.1" - tags: "ghcr.io/facebookincubator/velox-dev:spark-server" + file: scripts/docker/spark-container.dockerfile + args: SPARK_VERSION=3.5.1 + tags: ghcr.io/facebookincubator/velox-dev:spark-server steps: - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # v2.2.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 - name: Build and Push - uses: docker/build-push-action@v6 + uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0 # v6.17.0 with: - file: "${{ matrix.file }}" - build-args: "${{ matrix.args }}" + file: ${{ matrix.file }} + build-args: ${{ matrix.args }} push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: "${{ matrix.tags }}" + tags: ${{ matrix.tags }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 1a799f313ed1..80ebde553ad5 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,13 +17,13 @@ name: Update Documentation on: push: paths: - - "velox/docs/**" - - ".github/workflows/docs.yml" + - velox/docs/** + - .github/workflows/docs.yml pull_request: paths: - - "velox/docs/**" - - ".github/workflows/docs.yml" + - velox/docs/** + - .github/workflows/docs.yml permissions: contents: write @@ -38,15 +38,15 @@ jobs: runs-on: 8-core-ubuntu container: ghcr.io/facebookincubator/velox-dev:centos9 env: - CCACHE_DIR: "/tmp/ccache" + CCACHE_DIR: /tmp/ccache steps: - - name: "Restore ccache" + - name: Restore ccache if: false uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 id: restore-cache with: - path: "${{ env.CCACHE_DIR }}" + path: ${{ env.CCACHE_DIR }} key: ccache-wheels-8-core-ubuntu - name: Checkout @@ -67,15 +67,15 @@ jobs: which uv make python-venv which uv - uv pip install -r scripts/docs-requirements.txt - + uv pip install -r scripts/docs-requirements.txt + # Install pyvelox to generate it's docs make python-build - - name: "Save ccache" + - name: Save ccache uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: "${{ env.CCACHE_DIR }}" + path: ${{ env.CCACHE_DIR }} key: ccache-wheels-8-core-ubuntu - name: Build Documentation diff --git a/.github/workflows/linux-build-base.yml b/.github/workflows/linux-build-base.yml index 3b691120531a..dfac7e1ddf08 100644 --- a/.github/workflows/linux-build-base.yml +++ b/.github/workflows/linux-build-base.yml @@ -18,7 +18,7 @@ on: workflow_call: inputs: use-clang: - description: 'Use Clang to compile the project.' + description: Use Clang to compile the project. 
default: false required: false type: boolean @@ -34,14 +34,17 @@ jobs: run: shell: bash env: - CCACHE_DIR: "${{ github.workspace }}/ccache" + CCACHE_DIR: ${{ github.workspace }}/ccache VELOX_DEPENDENCY_SOURCE: SYSTEM GTest_SOURCE: BUNDLED cudf_SOURCE: BUNDLED - CUDA_VERSION: "12.8" + CUDA_VERSION: '12.8' USE_CLANG: "${{ inputs.use-clang && 'true' || 'false' }}" steps: - uses: actions/checkout@v4 + with: + fetch-depth: 2 + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -50,38 +53,40 @@ jobs: - name: Install Dependencies run: | - # Allows to install arbitrary cuda-version whithout needing to update - # docker container before. It simplifies testing new/different versions - if ! yum list installed cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') 1>/dev/null; then - source scripts/setup-centos9.sh - install_cuda ${CUDA_VERSION} + if git diff --name-only HEAD^1 HEAD | grep -q "scripts/setup-"; then + # Overwrite old setup scripts with changed versions + cp scripts/setup-* / + + mkdir /tmp/build + cd /tmp/build + source /opt/rh/gcc-toolset-12/enable + # install basic deps + bash /setup-centos9.sh + + source /setup-centos9.sh + install_adapters + install_cuda $CUDA_VERSION + + cd / + rm -rf /tmp/build # cleanup to avoid issues with disk space fi - - name: Install Minio - run: | - MINIO_BINARY="minio-2022-05-26" - if [ ! -f /usr/local/bin/${MINIO_BINARY} ]; then - wget https://dl.min.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z -O ${MINIO_BINARY} - chmod +x ./${MINIO_BINARY} - mv ./${MINIO_BINARY} /usr/local/bin/ - fi - - - uses: assignUser/stash/restore@v1 + - uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-linux-adapters-${{ inputs.use-clang && 'clang' || 'gcc' }} - - name: "Zero Ccache Statistics" + - name: Zero Ccache Statistics run: | ccache -sz - name: Make Release Build env: - MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' + MAKEFLAGS: NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4 CUDA_ARCHITECTURES: 70 CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc # Set compiler to GCC 12 - CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-12/root/usr/bin" + CUDA_FLAGS: -ccbin /opt/rh/gcc-toolset-12/root/usr/bin run: | EXTRA_CMAKE_FLAGS=( "-DVELOX_ENABLE_BENCHMARKS=ON" @@ -105,16 +110,16 @@ jobs: - name: Ccache after run: ccache -s - - uses: assignUser/stash/save@v1 + - uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-linux-adapters-${{ inputs.use-clang && 'clang' || 'gcc' }} - name: Run Tests # Some of the adapters dependencies are in the 'adapters' conda env shell: mamba run --no-capture-output -n adapters /usr/bin/bash -e {0} env: - LIBHDFS3_CONF: "${{ github.workspace }}/scripts/hdfs-client.xml" + LIBHDFS3_CONF: ${{ github.workspace }}/scripts/hdfs-client.xml working-directory: _build/release run: | export CLASSPATH=`/usr/local/hadoop/bin/hdfs classpath --glob` @@ -124,10 +129,10 @@ jobs: runs-on: 8-core-ubuntu-22.04 # prevent errors when forks ff their main branch if: ${{ github.repository == 'facebookincubator/velox' }} - name: "Ubuntu debug with resolve_dependency" + name: Ubuntu debug with resolve_dependency env: - CCACHE_DIR: "${{ github.workspace }}/ccache" - USE_CLANG: "${{ inputs.use-clang && 'true' || 'false' }}" + CCACHE_DIR: 
${{ github.workspace }}/ccache + USE_CLANG: ${{ inputs.use-clang && 'true' || 'false' }} defaults: run: shell: bash @@ -135,19 +140,20 @@ jobs: steps: - name: Get Ccache Stash - uses: assignUser/stash/restore@v1 + uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-ubuntu-debug-default-${{ inputs.use-clang && 'clang' || 'gcc' }} - name: Ensure Stash Dirs Exists working-directory: ${{ github.workspace }} run: | - mkdir -p '${{ env.CCACHE_DIR }}' + mkdir -p "$CCACHE_DIR" - uses: actions/checkout@v4 with: path: velox + persist-credentials: false - name: Install Dependencies run: | @@ -161,8 +167,8 @@ jobs: env: VELOX_DEPENDENCY_SOURCE: BUNDLED ICU_SOURCE: SYSTEM - MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3" - EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON -DVELOX_ENABLE_EXAMPLES=ON" + MAKEFLAGS: NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3 + EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON -DVELOX_ENABLE_EXAMPLES=ON run: | if [[ "${USE_CLANG}" = "true" ]]; then export CC=/usr/bin/clang-15; export CXX=/usr/bin/clang++-15; fi make debug @@ -171,9 +177,9 @@ jobs: run: | ccache -vs - - uses: assignUser/stash/save@v1 + - uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-ubuntu-debug-default-${{ inputs.use-clang && 'clang' || 'gcc' }} - name: Run Tests diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index bca6e5f9498a..f91a404668d5 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -17,33 +17,31 @@ name: Linux Build using GCC on: push: branches: - - "main" + - main paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-common.sh" - - "scripts/setup-versions.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/linux-build.yml" - - ".github/workflows/linux-build-base.yml" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-common.sh + - scripts/setup-versions.sh + - scripts/setup-helper-functions.sh + - .github/workflows/linux-build.yml + - .github/workflows/linux-build-base.yml pull_request: paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-common.sh" - - "scripts/setup-versions.sh" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/linux-build.yml" - - ".github/workflows/linux-build-base.yml" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-common.sh + - scripts/setup-versions.sh + - scripts/setup-helper-functions.sh + - .github/workflows/linux-build.yml + - .github/workflows/linux-build-base.yml permissions: contents: read diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 1f9135cd07c4..b760528b828e 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -17,23 +17,21 @@ name: macOS Build on: push: paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-macos.sh" - - ".github/workflows/macos.yml" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - 
scripts/setup-macos.sh + - .github/workflows/macos.yml pull_request: paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-macos.sh" - - ".github/workflows/macos.yml" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - scripts/setup-macos.sh + - .github/workflows/macos.yml permissions: contents: read @@ -45,7 +43,7 @@ concurrency: jobs: macos-build: if: ${{ github.repository == 'facebookincubator/velox' }} - name: "${{ matrix.os }}" + name: ${{ matrix.os }} strategy: fail-fast: false matrix: @@ -54,19 +52,20 @@ jobs: os: [macos-13, macos-14] runs-on: ${{ matrix.os }} env: - CCACHE_DIR: '${{ github.workspace }}/ccache' + CCACHE_DIR: ${{ github.workspace }}/ccache # The arm runners have only 7GB RAM - BUILD_TYPE: "${{ matrix.os == 'macos-14' && 'Release' || 'Debug' }}" - INSTALL_PREFIX: "/tmp/deps-install" + BUILD_TYPE: ${{ matrix.os == 'macos-14' && 'Release' || 'Debug' }} + INSTALL_PREFIX: /tmp/deps-install steps: - name: Checkout uses: actions/checkout@v4 with: - submodules: recursive + persist-credentials: false + - name: Install Dependencies env: - HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "TRUE" - CMAKE_POLICY_VERSION_MINIMUM: "3.5" + HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: 'TRUE' + CMAKE_POLICY_VERSION_MINIMUM: '3.5' run: | source scripts/setup-macos.sh install_build_prerequisites @@ -79,35 +78,35 @@ jobs: pipx install --force cmake==3.31 - name: Cache ccache - uses: assignUser/stash/restore@v1 + uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-macos-1-${{ matrix.os }} - name: Configure Build env: fmt_SOURCE: BUNDLED #brew fmt11 is not supported - CMAKE_POLICY_VERSION_MINIMUM: "3.5" + CMAKE_POLICY_VERSION_MINIMUM: '3.5' run: | - ccache -sz -M 5Gi - cmake \ - -B _build/$BUILD_TYPE \ - -GNinja \ - -DTREAT_WARNINGS_AS_ERRORS=1 \ - -DENABLE_ALL_WARNINGS=1 \ - -DVELOX_ENABLE_PARQUET=ON \ - -DVELOX_MONO_LIBRARY=ON \ - -DVELOX_BUILD_SHARED=ON \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE + ccache -sz -M 5Gi + cmake \ + -B _build/$BUILD_TYPE \ + -GNinja \ + -DTREAT_WARNINGS_AS_ERRORS=1 \ + -DENABLE_ALL_WARNINGS=1 \ + -DVELOX_ENABLE_PARQUET=ON \ + -DVELOX_MONO_LIBRARY=ON \ + -DVELOX_BUILD_SHARED=ON \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE - name: Build run: | - cmake --build _build/$BUILD_TYPE -j $NJOBS - ccache -s + cmake --build _build/$BUILD_TYPE -j $NJOBS + ccache -s - - uses: assignUser/stash/save@v1 + - uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: '${{ env.CCACHE_DIR }}' + path: ${{ env.CCACHE_DIR }} key: ccache-macos-1-${{ matrix.os }} - name: Run Tests diff --git a/.github/workflows/preliminary_checks.yml b/.github/workflows/preliminary_checks.yml index 3e63212b5ec4..e18f9864da86 100644 --- a/.github/workflows/preliminary_checks.yml +++ b/.github/workflows/preliminary_checks.yml @@ -29,60 +29,39 @@ concurrency: cancel-in-progress: true jobs: - check-matrix: - name: ${{ matrix.config.name }} + pre-commit: runs-on: ubuntu-latest - container: ghcr.io/facebookincubator/velox-dev:check - strategy: - fail-fast: false - matrix: - config: - - { name: "License Header", - command: "header-fix", - message: "Found missing License Header(s)", - } - - { name: "Code Format", - command: "format-fix", - message: "Found format issues" - } steps: + - run: python -m pip install pre-commit + - uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 2 + 
persist-credentials: false - - name: Fix git permissions - # Usually actions/checkout does this but as we run in a container - # it doesn't work - run: git config --global --add safe.directory ${GITHUB_WORKSPACE} + - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ~/.cache/pre-commit + key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} + restore-keys: | + pre-commit- - - name: Check ${{ matrix.config.name }} + - name: Run pre-commit + env: + GH_TOKEN: ${{ github.token }} run: | - make ${{ matrix.config.command }} - - if ! git diff --quiet; then - diff=`git --no-pager diff` - echo "${{ matrix.command.message }} in the following files:" - git --no-pager diff --name-only - echo "Check the Job summary for a copy-pasteable patch." - - echo "> [!IMPORTANT]" >> $GITHUB_STEP_SUMMARY - echo "${{ matrix.config.message }}" >> $GITHUB_STEP_SUMMARY - echo "> Please apply fix using:" >> $GITHUB_STEP_SUMMARY - echo "\`\`\`sh" >> $GITHUB_STEP_SUMMARY - echo "patch -p1 <> $GITHUB_STEP_SUMMARY - echo "$diff" >> $GITHUB_STEP_SUMMARY - echo "EOF" >> $GITHUB_STEP_SUMMARY - echo "\`\`\`" >> $GITHUB_STEP_SUMMARY - exit 1 - fi - + files=$(git diff --name-only HEAD^1 HEAD) + echo "::group::Changed files" + echo $files | tr ' ' '\n' + echo "::endgroup::" + pre-commit run --show-diff-on-failure --color=always --files $files title-check: name: PR Title Format runs-on: ubuntu-latest steps: - shell: python env: - title: "${{ github.event.pull_request.title }}" + title: '${{ github.event.pull_request.title }}' run: | import re import os diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index b18b5be56e6c..c9332edfa6ed 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -12,60 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: "Fuzzer Jobs" +name: Fuzzer Jobs on: pull_request: paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-common.sh" - - "scripts/setup-versions.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/scheduled.yml" - - "setup.py" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-common.sh + - scripts/setup-versions.sh + - scripts/setup-helper-functions.sh + - .github/workflows/scheduled.yml + - pyproject.toml push: branches: - - "main" + - main paths: - - "velox/**" - - "!velox/docs/**" - - "CMakeLists.txt" - - "CMake/**" - - "third_party/**" - - "scripts/setup-ubuntu.sh" - - "scripts/setup-common.sh" - - "scripts/setup-versions.sh" - - "scripts/setup-helper-functions.sh" - - ".github/workflows/scheduled.yml" + - velox/** + - '!velox/docs/**' + - CMakeLists.txt + - CMake/** + - scripts/setup-ubuntu.sh + - scripts/setup-common.sh + - scripts/setup-versions.sh + - scripts/setup-helper-functions.sh + - .github/workflows/scheduled.yml + - pyproject.toml schedule: - - cron: '0 3 * * *' + - cron: 0 3 * * * workflow_dispatch: inputs: ref: - description: 'Ref to checkout out' - default: 'main' + description: Ref to checkout out + default: main numThreads: - description: 'Number of threads' + description: Number of threads default: 16 maxHighMemJobs: - description: 'Number of high memory jobs' + description: Number of high memory jobs default: 8 maxLinkJobs: - description: 'Maximum number of link jobs' + description: Maximum number of link jobs default: 4 extraCMakeFlags: - description: 'Additional CMake flags' + description: Additional CMake flags default: '' duration: - description: 'Duration of fuzzer run in seconds' + description: Duration of fuzzer run in seconds default: 1800 defaults: @@ -84,9 +83,9 @@ concurrency: env: # Run for 15 minute on PRs - DURATION: "${{ inputs.duration || ( github.event_name != 'schedule' && 900 || 1800 )}}" + DURATION: ${{ inputs.duration || ( github.event_name != 'schedule' && 900 || 1800 )}} # minimize artifact duration for PRs, keep them a bit longer for nightly runs - RETENTION: "${{ github.event_name == 'pull_request' && 1 || 3 }}" + RETENTION: ${{ github.event_name == 'pull_request' && 1 || 3 }} jobs: compile: @@ -97,12 +96,12 @@ jobs: container: ghcr.io/facebookincubator/velox-dev:centos9 timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache" - NUM_THREADS: "${{ inputs.numThreads || 16 }}" - MAX_HIGH_MEM_JOBS: "${{ inputs.maxHighMemJobs || 8 }}" - MAX_LINK_JOBS: "${{ inputs.maxLinkJobs || 4 }}" - SKBUILD_BUILD_DIR: "_build/debug" - PYVELOX_LEGACY_ONLY: "ON" + CCACHE_DIR: ${{ github.workspace }}/ccache + NUM_THREADS: ${{ inputs.numThreads || 16 }} + MAX_HIGH_MEM_JOBS: ${{ inputs.maxHighMemJobs || 8 }} + MAX_LINK_JOBS: ${{ inputs.maxLinkJobs || 4 }} + SKBUILD_BUILD_DIR: _build/debug + PYVELOX_LEGACY_ONLY: 'ON' defaults: run: @@ -134,22 +133,22 @@ jobs: echo "head_main=$head_main" >> $GITHUB_OUTPUT - name: Get Function Signature Stash - uses: assignUser/stash/restore@v1 + uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 id: get-sig with: path: /tmp/signatures key: function-signatures-${{ steps.get-head.outputs.head_main || github.sha }} - name: Restore ccache - uses: assignUser/stash/restore@v1 + uses: apache/infrastructure-actions/stash/restore@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: "${{ env.CCACHE_DIR }}" + 
path: ${{ env.CCACHE_DIR }} key: ccache-fuzzer-centos - name: Fix git permissions working-directory: ${{ github.workspace }} - # Usually actions/checkout does this but as we run in a container - # it doesn't work + # Usually actions/checkout does this but as we run in a container + # it doesn't work run: | git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox_main @@ -162,10 +161,11 @@ jobs: - name: Checkout Main if: ${{ github.event_name != 'schedule' && steps.get-sig.outputs.stash-hit != 'true' }} - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ steps.get-head.outputs.head_main || 'main' }} path: velox_main + persist-credentials: false - name: Build PyVelox if: ${{ github.event_name != 'schedule' && steps.get-sig.outputs.stash-hit != 'true' }} @@ -185,17 +185,17 @@ jobs: - name: Save Function Signature Stash if: ${{ github.event_name == 'pull_request' && steps.get-sig.outputs.stash-hit != 'true' }} - uses: assignUser/stash/save@v1 + uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: path: /tmp/signatures key: function-signatures-${{ steps.get-head.outputs.head_main }} - name: Checkout Contender - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Zero Ccache Statistics run: | @@ -214,7 +214,6 @@ jobs: source .venv/bin/activate make debug - - name: Ccache after run: ccache -s @@ -222,9 +221,9 @@ jobs: # see https://github.com/actions/upload-artifact/issues/543 continue-on-error: true if: ${{ github.event_name != 'schedule' }} - uses: assignUser/stash/save@v1 + uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: - path: "${{ env.CCACHE_DIR }}" + path: ${{ env.CCACHE_DIR }} key: ccache-fuzzer-centos - name: Build PyVelox @@ -246,11 +245,11 @@ jobs: - name: Upload Signature Artifacts if: ${{ github.event_name != 'schedule' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: signatures path: /tmp/signatures - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Prepare signatures working-directory: /tmp/signatures @@ -266,101 +265,101 @@ jobs: - name: Save Function Signature Stash if: ${{ github.event_name == 'push' }} - uses: assignUser/stash/save@v1 + uses: apache/infrastructure-actions/stash/save@3354c1565d4b0e335b78a76aedd82153a9e144d4 with: path: /tmp/signatures key: function-signatures-${{ github.sha }} - name: Upload presto fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto path: velox/_build/debug/velox/expression/fuzzer/velox_expression_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload spark expression fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: spark_expression_fuzzer path: velox/_build/debug/velox/expression/fuzzer/spark_expression_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload spark aggregation fuzzer - uses: actions/upload-artifact@v4 + uses: 
actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: spark_aggregation_fuzzer path: velox/_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload aggregation fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: aggregation path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload join fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: join path: velox/_build/debug/velox/exec/fuzzer/velox_join_fuzzer - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload exchange fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: exchange path: velox/_build/debug//velox/exec/fuzzer/velox_exchange_fuzzer - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload window fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: window path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_window_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload cache fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: cache_fuzzer path: velox/_build/debug/velox/exec/fuzzer/velox_cache_fuzzer - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload table evolution fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: table_evolution_fuzzer path: velox/_build/debug/velox/exec/tests/velox_table_evolution_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload memory arbitration fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: memory_arbitration_fuzzer - path: velox/_build/debug/velox/exec/tests/velox_memory_arbitration_fuzzer_test - retention-days: "${{ env.RETENTION }}" + path: velox/_build/debug/velox/exec/fuzzer/velox_memory_arbitration_fuzzer + retention-days: ${{ env.RETENTION }} - name: Upload row number fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: row_number path: velox/_build/debug//velox/exec/fuzzer/velox_row_number_fuzzer - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload topn row number fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: topn_row_number path: velox/_build/debug//velox/exec/fuzzer/velox_topn_row_number_fuzzer - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} - name: Upload writer fuzzer - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: writer path: 
velox/_build/debug/velox/functions/prestosql/fuzzer/velox_writer_fuzzer_test - retention-days: "${{ env.RETENTION }}" + retention-days: ${{ env.RETENTION }} presto-fuzzer-run: name: Presto Fuzzer @@ -370,8 +369,7 @@ jobs: needs: compile timeout-minutes: 120 steps: - - - uses: dorny/paths-filter@v3 + - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 if: github.event_name == 'pull_request' id: changes with: @@ -386,12 +384,11 @@ jobs: - name: Set presto specific fuzzer duration env: # Run for 30 minutes instead of 15, when files relevant to presto are touched - pr_duration: "${{ steps.changes.outputs.presto == 'true' && 1800 || 900 }}" + pr_duration: ${{ steps.changes.outputs.presto == 'true' && 1800 || 900 }} # Run for 60 minutes if its a scheduled run - other_duration: "${{ inputs.duration || (github.event_name == 'push' && 1800 || 3600) }}" - is_pr: "${{ github.event_name == 'pull_request' }}" + other_duration: ${{ inputs.duration || (github.event_name == 'push' && 1800 || 3600) }} + is_pr: ${{ github.event_name == 'pull_request' }} run: | - if [ "$is_pr" == "true" ]; then duration=$pr_duration else @@ -401,7 +398,7 @@ jobs: echo "DURATION=$duration" >> $GITHUB_ENV - name: Download presto fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: presto @@ -432,7 +429,7 @@ jobs: - name: Archive production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-fuzzer-failure-artifacts path: | @@ -448,12 +445,12 @@ jobs: steps: - name: Download presto expression fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: presto - name: Download Signatures - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: signatures path: /tmp/signatures @@ -487,7 +484,7 @@ jobs: - name: Archive Spark expression production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-bias-fuzzer-failure-artifacts path: | @@ -502,7 +499,7 @@ jobs: steps: - name: Download spark aggregation fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: spark_aggregation_fuzzer @@ -524,7 +521,7 @@ jobs: - name: Archive Spark aggregate production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: spark-agg-fuzzer-failure-artifacts path: | @@ -540,12 +537,12 @@ jobs: steps: - name: Download spark expression fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: spark_expression_fuzzer - name: Download Signatures - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: signatures path: /tmp/signatures @@ -570,7 +567,7 @@ jobs: - name: Archive Spark expression production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: spark-fuzzer-failure-artifacts path: | @@ -586,7 +583,7 @@ 
jobs: steps: - name: Download spark expression fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: spark_expression_fuzzer @@ -615,7 +612,7 @@ jobs: - name: Archive Spark expression production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: spark-fuzzer-failure-artifacts path: | @@ -628,24 +625,24 @@ jobs: needs: compile timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download join fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: join - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions - # Usually actions/checkout does this but as we run in a container + # Usually actions/checkout does this but as we run in a container # it doesn't work run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox @@ -676,7 +673,7 @@ jobs: - name: Archive join production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-sot-join-fuzzer-failure-artifacts path: | @@ -692,7 +689,7 @@ jobs: steps: - name: Download exchange fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: exchange @@ -715,11 +712,11 @@ jobs: - name: Archive Exchange production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: exchange-fuzzer-failure-artifacts - path: | - /tmp/exchange_fuzzer_repro + name: exchange-fuzzer-failure-artifacts + path: | + /tmp/exchange_fuzzer_repro presto-java-row-number-fuzzer-run: name: RowNumber Fuzzer @@ -728,22 +725,22 @@ jobs: needs: compile timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download row number fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: row_number - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -777,11 +774,11 @@ jobs: - name: Archive row number production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: row-fuzzer-failure-artifacts - path: | - /tmp/row_fuzzer_repro + name: row-fuzzer-failure-artifacts + path: | + /tmp/row_fuzzer_repro presto-java-topn-row-number-fuzzer-run: name: TopNRowNumber Fuzzer @@ -790,22 +787,22 @@ jobs: needs: 
compile timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download topn row number fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: topn_row_number - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -840,7 +837,7 @@ jobs: - name: Archive topn row number production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: topn-row-fuzzer-failure-artifacts path: | @@ -857,7 +854,7 @@ jobs: steps: - name: Download cache fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: cache_fuzzer @@ -876,11 +873,11 @@ jobs: - name: Archive Cache production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: cache-fuzzer-logs - path: | - /tmp/cache_fuzzer + name: cache-fuzzer-logs + path: | + /tmp/cache_fuzzer table-evolution-fuzzer-run: name: Table Evolution Fuzzer @@ -891,7 +888,7 @@ jobs: steps: - name: Download table evolution fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: table_evolution_fuzzer @@ -910,11 +907,11 @@ jobs: - name: Archive table evolution production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: table-evolution-fuzzer-test-logs - path: | - /tmp/table_evolution_fuzzer_test + name: table-evolution-fuzzer-test-logs + path: | + /tmp/table_evolution_fuzzer_test memory-arbitration-fuzzer-run: name: Memory Arbitration Fuzzer @@ -925,30 +922,30 @@ jobs: steps: - name: Download memory arbitration fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: memory_arbitration_fuzzer - name: Run Memory Arbitration Fuzzer run: | - mkdir -p /tmp/memory_arbitration_fuzzer_test/logs/ - chmod -R 777 /tmp/memory_arbitration_fuzzer_test - chmod +x velox_memory_arbitration_fuzzer_test - ./velox_memory_arbitration_fuzzer_test \ + mkdir -p /tmp/memory_arbitration_fuzzer/logs/ + chmod -R 777 /tmp/memory_arbitration_fuzzer + chmod +x velox_memory_arbitration_fuzzer + ./velox_memory_arbitration_fuzzer \ --seed ${RANDOM} \ --duration_sec $DURATION \ --minloglevel=0 \ --stderrthreshold=2 \ - --log_dir=/tmp/memory_arbitration_fuzzer_test/logs \ + --log_dir=/tmp/memory_arbitration_fuzzer/logs \ && echo -e "\n\Memory arbitration fuzzer run finished successfully." 
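As an aside, the renamed binary can be exercised outside CI with essentially the same invocation as the step above. A minimal local sketch, assuming a debug build already exists under `_build/debug` (the binary path mirrors the upload step earlier in this workflow; the duration value is illustrative, CI derives it from the trigger):

```
# Hypothetical local repro of the CI step above; flags are copied from the
# workflow, the 1800s duration is just an example value.
mkdir -p /tmp/memory_arbitration_fuzzer/logs/
./_build/debug/velox/exec/fuzzer/velox_memory_arbitration_fuzzer \
  --seed ${RANDOM} \
  --duration_sec 1800 \
  --minloglevel=0 \
  --stderrthreshold=2 \
  --log_dir=/tmp/memory_arbitration_fuzzer/logs
```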
- name: Archive memory arbitration production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: memory-arbitration-fuzzer-test-logs - path: | - /tmp/memory_arbitration_fuzzer_test + name: memory-arbitration-fuzzer-test-logs + path: | + /tmp/memory_arbitration_fuzzer_test presto-java-aggregation-fuzzer-run: name: Aggregation Fuzzer with Presto as source of truth @@ -957,29 +954,28 @@ jobs: container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download aggregation fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: aggregation - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions - # Usually actions/checkout does this but as we run in a container + # Usually actions/checkout does this but as we run in a container # it doesn't work run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - - - name: "Run Aggregate Fuzzer" + - name: Run Aggregate Fuzzer run: | cd velox cp ./scripts/ci/presto/etc/hive.properties $PRESTO_HOME/etc/catalog @@ -1007,14 +1003,13 @@ jobs: - name: Archive aggregate production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-sot-aggregate-fuzzer-failure-artifacts path: | /tmp/aggregate_fuzzer_repro /tmp/server.log - presto-java-only-bias-function-expression-fuzzer-run: name: Biased Expression Fuzzer with Only Added/Updated Functions and Presto as source of truth needs: compile @@ -1023,21 +1018,21 @@ jobs: timeout-minutes: 120 if: ${{ needs.compile.outputs.presto_bias == 'true' }} env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download presto expression fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: presto - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -1045,7 +1040,7 @@ jobs: run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - name: Download Signatures - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: signatures path: /tmp/signatures @@ -1098,7 +1093,7 @@ jobs: && echo -e "\n\nPresto Fuzzer run finished successfully." 
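A recurring pattern in this diff: every third-party action is pinned to a full commit SHA, with the human-readable tag kept as a trailing comment (the hardening that the zizmor configuration added below checks for). One way to resolve a tag to the commit SHA to pin is `git ls-remote`; a small sketch, using the checkout action as an example:

```
# List the remote ref for a release tag. For annotated tags, the peeled
# "^{}" entry is the commit SHA that should be pinned in the workflow.
git ls-remote --tags https://github.com/actions/checkout refs/tags/v4.2.2
```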
- name: Archive Presto only-bias-function expression fuzzer production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-only-bias-function-fuzzer-failure-artifacts path: | @@ -1112,21 +1107,21 @@ jobs: timeout-minutes: 120 if: ${{ needs.compile.outputs.presto_aggregate_bias == 'true' }} env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download aggregation fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: aggregation - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container @@ -1134,12 +1129,12 @@ jobs: run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - name: Download Signatures - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: signatures path: /tmp/signatures - - name: "Run Bias Aggregate Fuzzer" + - name: Run Bias Aggregate Fuzzer run: | cd velox cp ./scripts/ci/presto/etc/hive.properties $PRESTO_HOME/etc/catalog @@ -1172,7 +1167,7 @@ jobs: - name: Archive bias aggregate production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-bias-sot-aggregate-fuzzer-failure-artifacts path: | @@ -1186,7 +1181,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download Signatures - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: signatures path: /tmp/signatures @@ -1210,29 +1205,28 @@ jobs: container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download window fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: window - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions - # Usually actions/checkout does this but as we run in a container + # Usually actions/checkout does this but as we run in a container # it doesn't work run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - - - name: "Run Window Fuzzer" + - name: Run Window Fuzzer run: | cd velox cp ./scripts/ci/presto/etc/hive.properties $PRESTO_HOME/etc/catalog @@ -1261,7 +1255,7 @@ jobs: - name: Archive window production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-sot-window-fuzzer-failure-artifacts path: | @@ -1275,28 +1269,28 @@ jobs: container: 
ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/ccache/" - LINUX_DISTRO: "centos" + CCACHE_DIR: ${{ github.workspace }}/ccache/ + LINUX_DISTRO: centos steps: - name: Download writer fuzzer - uses: actions/download-artifact@v4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: name: writer - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Checkout Repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" + ref: ${{ inputs.ref }} + persist-credentials: false - name: Fix git permissions # Usually actions/checkout does this but as we run in a container # it doesn't work run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - - name: "Run Writer Fuzzer" + - name: Run Writer Fuzzer run: | cd velox cp ./scripts/ci/presto/etc/hive.properties $PRESTO_HOME/etc/catalog @@ -1327,7 +1321,7 @@ jobs: - name: Archive writer production artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: presto-sot-writer-fuzzer-failure-artifacts path: | diff --git a/scripts/docker/check-container.dockfile b/.github/zizmor.yml similarity index 82% rename from scripts/docker/check-container.dockfile rename to .github/zizmor.yml index 9240a97dcd8c..a1baabf91cf8 100644 --- a/scripts/docker/check-container.dockfile +++ b/.github/zizmor.yml @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -FROM amd64/ubuntu:24.04 -COPY scripts/setup-check.sh /root -COPY scripts/setup-helper-functions.sh / -RUN bash /root/setup-check.sh +rules: + use-trusted-publishing: + ignore: + - build_pyvelox.yml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000000..c902b08ed1ff --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# See https://pre-commit.com for more information + +# General excludes, files can also be excluded on a hook level +exclude: .*\.patch|scripts/tests/.*|velox/external/.*|CMake/third-party/.* +default_install_hook_types: [pre-commit, pre-push] +repos: + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + - id: end-of-file-fixer + - id: check-added-large-files + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable + + - repo: local + hooks: + - id: cmake-format + name: cmake-format + description: Format CMake files. 
+ entry: cmake-format + language: python + files: (CMakeLists.*|.*\.cmake|.*\.cmake.in)$ + args: [--in-place] + require_serial: false + additional_dependencies: [cmake-format==0.6.13, pyyaml] + + - id: clang-tidy + name: clang-tidy + description: Run clang-tidy on C/C++ files + stages: + - manual # Needs compile_commands.json + entry: clang-tidy + language: python + types_or: [c++, c] + additional_dependencies: [clang-tidy==18.1.8] + require_serial: true + + - id: license-header + name: license-header + description: Add missing license headers. + entry: ./scripts/checks/license-header.py + args: [-i] + language: python + additional_dependencies: [regex] + require_serial: true + exclude: | + (?x)^( + CMake/Find(Snappy|Sodium|Thrift|double-conversion)\.cmake| + velox/docs/affiliations_map.txt| + velox/.*/bitpacking\.(cpp|h)| + velox/.*/Lemire/.*| + velox/.*/gpu/CudaMemMeter.cu| + velox/.*/coverage/data/.*| + velox/tpch/gen/dbgen/.*| + NOTICE.txt + )$ + + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.3 + hooks: + - id: clang-format + # types_or: [c++, c, cuda, metal, objective-c] + files: \.(cpp|cc|c|h|hpp|inc|cu|cuh|clcpp|mm|metal)$ + + # Python + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + # The following checks mostly target GitHub Actions workflows. + - repo: https://github.com/adrienverge/yamllint.git + rev: v1.37.0 + hooks: + - id: yamllint + args: [--format, parsable, --strict] + exclude: .*\.clang-(tidy|format) + + - repo: https://github.com/google/yamlfmt + rev: v0.16.0 + hooks: + - id: yamlfmt + exclude: .*\.clang-(tidy|format) + + - repo: https://github.com/zizmorcore/zizmor-pre-commit + rev: v1.7.0 + hooks: + - id: zizmor + + - repo: https://github.com/mpalmer/action-validator + rev: 2f8be1d2066eb3687496a156d00b4f1b3ea7b028 + hooks: + - id: action-validator diff --git a/scripts/setup-check.sh b/.yamlfmt.yml similarity index 59% rename from scripts/setup-check.sh rename to .yamlfmt.yml index d3d6573a8eda..f25bdcc6c46a 100644 --- a/scripts/setup-check.sh +++ b/.yamlfmt.yml @@ -1,4 +1,3 @@ -#!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -set -e -set -x - -export DEBIAN_FRONTEND=noninteractive -apt update -apt install --no-install-recommends -y clang-format-18 python3-pip git make ssh -pip3 install --break-system-packages cmake==3.28.3 cmake_format black pyyaml regex -pip3 cache purge -apt purge --auto-remove -y python3-pip -update-alternatives --install /usr/bin/clang-format clang-format "$(command -v clang-format-18)" 18 -apt clean +match_type: doublestar +exclude: + - '**/.clang-format' + - '**/.clang-tidy' +formatter: + type: basic + retain_line_breaks_single: true + scan_folded_as_literal: true + indent: 2 diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 000000000000..390f9f475021 --- /dev/null +++ b/.yamllint.yml @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +rules: + braces: + min-spaces-inside: 0 + max-spaces-inside: 1 + min-spaces-inside-empty: 0 + max-spaces-inside-empty: 0 + brackets: + min-spaces-inside: 0 + max-spaces-inside: 0 + min-spaces-inside-empty: 0 + max-spaces-inside-empty: 0 + comments: disable + comments-indentation: disable + document-end: disable + document-start: disable + empty-lines: disable + empty-values: + forbid-in-flow-mappings: true + forbid-in-block-sequences: true + float-values: + forbid-inf: true + forbid-nan: true + forbid-scientific-notation: true + require-numeral-before-decimal: true + indentation: disable + line-length: disable + octal-values: enable + quoted-strings: + required: only-when-needed + extra-allowed: ['.*\$\{\{.*\}\}.*'] + truthy: + allowed-values: ['true', 'false', 'on'] + level: warning diff --git a/CMake/FindSodium.cmake b/CMake/FindSodium.cmake index c486ac112b8e..68ea1f96550c 100644 --- a/CMake/FindSodium.cmake +++ b/CMake/FindSodium.cmake @@ -267,15 +267,17 @@ if(NOT TARGET sodium) endif() set_target_properties( - sodium PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${sodium_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LANGUAGES "C") + sodium + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${sodium_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C") if(sodium_USE_STATIC_LIBS) set_target_properties( sodium - PROPERTIES INTERFACE_COMPILE_DEFINITIONS "SODIUM_STATIC" - IMPORTED_LOCATION "${sodium_LIBRARY_RELEASE}" - IMPORTED_LOCATION_DEBUG "${sodium_LIBRARY_DEBUG}") + PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "SODIUM_STATIC" + IMPORTED_LOCATION "${sodium_LIBRARY_RELEASE}" + IMPORTED_LOCATION_DEBUG "${sodium_LIBRARY_DEBUG}") else() if(UNIX) set_target_properties( @@ -292,9 +294,10 @@ else() if(NOT (sodium_DLL_RELEASE MATCHES ".*-NOTFOUND")) set_target_properties( sodium - PROPERTIES IMPORTED_LOCATION_RELWITHDEBINFO "${sodium_DLL_RELEASE}" - IMPORTED_LOCATION_MINSIZEREL "${sodium_DLL_RELEASE}" - IMPORTED_LOCATION_RELEASE "${sodium_DLL_RELEASE}") + PROPERTIES + IMPORTED_LOCATION_RELWITHDEBINFO "${sodium_DLL_RELEASE}" + IMPORTED_LOCATION_MINSIZEREL "${sodium_DLL_RELEASE}" + IMPORTED_LOCATION_RELEASE "${sodium_DLL_RELEASE}") endif() endif() endif() diff --git a/CMake/Findglog.cmake b/CMake/Findglog.cmake index 752647cb3357..81deadb36442 100644 --- a/CMake/Findglog.cmake +++ b/CMake/Findglog.cmake @@ -1,4 +1,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# # - Try to find Glog # Once done, this will define # @@ -9,29 +22,26 @@ include(FindPackageHandleStandardArgs) include(SelectLibraryConfigurations) -find_library(GLOG_LIBRARY_RELEASE glog - PATHS ${GLOG_LIBRARYDIR}) -find_library(GLOG_LIBRARY_DEBUG glogd - PATHS ${GLOG_LIBRARYDIR}) +find_library(GLOG_LIBRARY_RELEASE glog PATHS ${GLOG_LIBRARYDIR}) +find_library(GLOG_LIBRARY_DEBUG glogd PATHS ${GLOG_LIBRARYDIR}) -find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS ${GLOG_INCLUDEDIR}) +find_path(GLOG_INCLUDE_DIR glog/logging.h PATHS ${GLOG_INCLUDEDIR}) select_library_configurations(GLOG) -find_package_handle_standard_args(glog DEFAULT_MSG - GLOG_LIBRARY - GLOG_INCLUDE_DIR) +find_package_handle_standard_args(glog DEFAULT_MSG GLOG_LIBRARY + GLOG_INCLUDE_DIR) -mark_as_advanced( - GLOG_LIBRARY - GLOG_INCLUDE_DIR) +mark_as_advanced(GLOG_LIBRARY GLOG_INCLUDE_DIR) set(GLOG_LIBRARIES ${GLOG_LIBRARY}) set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) -if (NOT TARGET glog::glog) +if(NOT TARGET glog::glog) add_library(glog::glog UNKNOWN IMPORTED) - set_target_properties(glog::glog PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIRS}") - set_target_properties(glog::glog PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" IMPORTED_LOCATION "${GLOG_LIBRARIES}") + set_target_properties(glog::glog PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${GLOG_INCLUDE_DIRS}") + set_target_properties( + glog::glog PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${GLOG_LIBRARIES}") endif() diff --git a/CMake/Findlz4.cmake b/CMake/Findlz4.cmake index d49115f12740..d13c951b8898 100644 --- a/CMake/Findlz4.cmake +++ b/CMake/Findlz4.cmake @@ -1,4 +1,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # - Try to find lz4 # Once done, this will define # diff --git a/CMake/Findlzo2.cmake b/CMake/Findlzo2.cmake index c263f5926c03..9f9fbbbe11ca 100644 --- a/CMake/Findlzo2.cmake +++ b/CMake/Findlzo2.cmake @@ -1,4 +1,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # - Try to find lzo2 # Once done, this will define # diff --git a/CMake/Findzstd.cmake b/CMake/Findzstd.cmake index 86c1214492cd..a74adb0fbe00 100644 --- a/CMake/Findzstd.cmake +++ b/CMake/Findzstd.cmake @@ -1,4 +1,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # - Try to find zstd # Once done, this will define # diff --git a/CODING_STYLE.md b/CODING_STYLE.md index 1f540e0c9356..43947192cd74 100644 --- a/CODING_STYLE.md +++ b/CODING_STYLE.md @@ -7,34 +7,50 @@ future. ## Code Formatting, Headers, and Licenses -Our Makefile contains targets to help highlight and fix format, header or -license issues. These targets are shortcuts for calling `./scripts/check.py`. +We use [pre-commit](https://pre-commit.com) to manage the installation and +execution of a number of code quality checks, called hooks. -Use `make header-fix` to apply our open source license headers to new files. -Use `make format-fix` to identify and fix formatting issues using clang-format. +### Installation -Formatting issues found on the changed lines in the current commit can be -displayed using `make format-check`. These issues can be fixed by using `make -format-fix`. This command will apply formatting changes to modified lines in -the current commit. +The recommended way to install pre-commit is through either +[`pipx`](https://pipx.pypa.io/stable/) or the newer +[`uv tool`](https://docs.astral.sh/uv/guides/tools/). Once you have +pre-commit available in your environment, you can enable running checks on +each commit by running `pre-commit install` in the root of the repository. -Header issues found on the changed files in the current commit can be displayed -using `make header-check`. These issues can be fixed by using `make header-fix`. -This will apply license header updates to the files in the current commit. +> [!TIP] +> This will take a few minutes the first time you run it, as `pre-commit` will +set up the environment for each hook by installing the required tool and +its dependencies in a separate environment to ensure reproducibility of the +check results. -An entire directory tree of files can be formatted and have license headers -added using the `tree` variant of the format commands: -``` - ./scripts/check.py format tree - ./scripts/check.py format tree --fix +The hooks are defined in `.pre-commit-config.yaml`. - ./scripts/check.py header tree - ./scripts/check.py header tree --fix -``` +After the setup is complete, each time you `git commit`, the hooks will be run +and potential changes applied to your *staged* files. Any unstaged files will +be stashed while the hooks run. If any changes occurred, the commit will *not* +succeed. The same happens when you `git push` but applies to all files that are being +pushed into the repository. + +You will have to review and stage the changed files and commit again. + +To manually run one specific hook, use `pre-commit run <hook-id>`, for example +`pre-commit run clang-format`. You can find the `hook-id` in `.pre-commit-config.yaml`. + +By design, `pre-commit` will only be run on the files that are part of the commit +or push. If you want to run the checks on all files (including unstaged files), you can +run `pre-commit run --all-files`. + +The `clang-tidy` hook will *not* be run automatically as it takes a long time and +requires CMake to be run first to create `compile_commands.json`. 
+It can be run explicitly via `pre-commit run --hook-stage=manual`. + +If you need to *temporarily* skip the checks, you can use the git flag `--no-verify`. -All the available formatting commands can be displayed by using -`./scripts/check.py help`. +> [!Important] +> We also run the hooks as part of our CI, which will flag any issues introduced by +skipping the checks. ## C++ Style diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b16b04e62cb..a76dd9e334cc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Welcome! Thank you for your interest in the Velox project. Before starting to contribute, please take a moment to review the guidelines outlined below. Contributions are not just about code. Contributing code is great, but that’s -probably not the best place to start. There are many ways in which people can +probably not the best place to start. There are many ways in which people can make contributions to the project and community. ## Code of Conduct @@ -34,28 +34,28 @@ found here](https://velox-lib.io/docs/community/components-and-maintainers). ## Documentation Help the community understand how to use the Velox library by proposing -additions to our [docs](https://facebookincubator.github.io/velox/index.html) or pointing +additions to our [docs](https://facebookincubator.github.io/velox/index.html) or pointing out outdated or missing pieces. ## Bug Reports Found a bug? Help us by filing an issue on GitHub. -Ensure the bug was not already reported by searching +Ensure the bug was not already reported by searching [GitHub Issues](https://github.com/facebookincubator/velox/issues). If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior. -Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure -of security bugs. In those cases, please go through the process outlined on that page +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure +of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Code Contribution Process -The code contribution process is designed to reduce the burden on reviewers and -maintainers, allowing them to provide more timely feedback and keeping the +The code contribution process is designed to reduce the burden on reviewers and +maintainers, allowing them to provide more timely feedback and keeping the amount of rework from contributors to a minimum. We encourage new contributors to start with bug fixes and small features so you @@ -105,11 +105,11 @@ The contribution process is outlined below: reviewer(s) by name, stating the comments have been addressed. This is the best way to ensure that the reviewer is notified that the code is ready to be reviewed again. - * As a PR author, please do not "Resolve Conversation" when review comments are + * As a PR author, please do not "Resolve Conversation" when review comments are addressed. Instead, wait for the reviewer to verify the comment has been addressed and resolve the conversation. -7. Iterate on this process until your changes are reviewed and accepted by a +7. Iterate on this process until your changes are reviewed and accepted by a maintainer. At this point, a Meta employee will be notified to merge your PR, due to tooling limitations. 
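Condensing the pre-commit workflow from the CODING_STYLE.md section above into concrete commands, a minimal sketch (pipx is one of the two suggested installers; the hook ids are the ones defined in `.pre-commit-config.yaml`):

```
# One-time setup.
pipx install pre-commit
pre-commit install                              # enables the pre-commit and pre-push hooks

# Day-to-day usage.
pre-commit run clang-format                     # run a single hook on staged files
pre-commit run --all-files                      # run all hooks on the whole tree
pre-commit run --hook-stage manual clang-tidy   # needs compile_commands.json first

# Temporary escape hatch; CI will still run the hooks.
git commit --no-verify
```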
@@ -193,35 +193,35 @@ write great commit messages: When submitting code contributions to Velox, make sure to adhere to the following best practices: -1. **Coding Style**: Review and strictly follow our coding style document, +1. **Coding Style**: Review and strictly follow our coding style document, available in [`CODING_STYLE.md`](CODING_STYLE.md). - * Velox favors consistency over personal preference. If there are - technical reasons why a specific guideline should not be followed, + * Velox favors consistency over personal preference. If there are + technical reasons why a specific guideline should not be followed, please start a separate discussion with the community to update the coding style document first. - * If you are simply updating code that did not comply with the coding - style, please do so in a standalone PR isolated from other logic changes. + * If you are simply updating code that did not comply with the coding + style, please do so in a standalone PR isolated from other logic changes. 2. **Small Incremental Changes**: If the change is large, work with the maintainers on a plan to break and submit it as smaller (yet atomic) parts. - * [Research indicates](https://smartbear.com/learn/code-review/best-practices-for-peer-code-review/) - that engineers can only effectively review up to - 400 lines of code at a time. The human brain can only process so much information + * [Research indicates](https://smartbear.com/learn/code-review/best-practices-for-peer-code-review/) + that engineers can only effectively review up to + 400 lines of code at a time. The human brain can only process so much information at a time; beyond that threshold the ability to find bugs and other flaws decreases. - * As larger PRs usually take longer to review and iterate, they - tend to slow down the software development process. As much as possible, - split your work into smaller changes. + * As larger PRs usually take longer to review and iterate, they + tend to slow down the software development process. As much as possible, + split your work into smaller changes. 3. **Unit tests**: With rare exceptions, every PR should contain unit tests covering the logic added/modified. * Unit tests protect our codebase from regressions, promote less coupled APIs, and provide an executable form of documentation that’s useful for new engineers reasoning about the codebase. - * Good unit tests are fast, isolated, repeatable, and exercise all APIs + * Good unit tests are fast, isolated, repeatable, and exercise all APIs including their edge cases. * The lack of existing tests is not a good reason not to add tests to - your PR. If a component or API does not have a corresponding + your PR. If a component or API does not have a corresponding unit test suite, please consider improving the codebase by first adding a new unit test suite to ensure the existing behavior is correct. @@ -232,9 +232,9 @@ following best practices: obvious and remove obscurity. * As a guideline, every file, class, member variable, and member function that is not a getter/setter should be documented. - * As much as possible, try to avoid functions with very large bodies. In the - (rare) cases where large code blocks are needed, a good practice is to group - smaller blocks of related code, and precede them with a blank line and a + * As much as possible, try to avoid functions with very large bodies. 
In the + (rare) cases where large code blocks are needed, a good practice is to group + smaller blocks of related code, and precede them with a blank line and a high-level comment explaining what the block does. 5. **Benchmarks**: Add micro-benchmarks to support your claims. @@ -242,8 +242,8 @@ following best practices: efficiency trade-offs. 6. **APIs**: Carefully design APIs. - * As a library, Velox APIs should be intentional. External API should only - be deliberately created. + * As a library, Velox APIs should be intentional. External API should only + be deliberately created. * As a rule of thumb, components should be deep and encapsulate as much complexity as possible, and APIs should be narrow, minimizing dependencies across components and preventing implementation details from leaking through @@ -261,20 +261,20 @@ with a benchmark. 2. Use the following template for the PR title: Add xxx [Presto|Spark] function (replace xxx with the function name). * Ensure the PR description contains a link to the function documentation - from Presto or Spark docs. + from Presto or Spark docs. * Describe the function semantics and edge cases clearly. -3. Use Presto or Spark to check the function semantics. +3. Use Presto or Spark to check the function semantics. * When implementing a Spark function, check the function semantics using Spark 3.5 with ANSI OFF. * Try different edge cases to check whether the function returns null, or - throws, etc. + throws, etc. * Make sure to replicate the exact semantics. -4. Add tests exercising common inputs, all possible signatures and corner cases. - * Make sure the test cases are concise and easily readable. +4. Add tests exercising common inputs, all possible signatures and corner cases. + * Make sure the test cases are concise and easily readable. -5. Make sure that obvious inefficiencies are addressed. - * If appropriate, provide micro-benchmarks to support your claims with data. +5. Make sure that obvious inefficiencies are addressed. + * If appropriate, provide micro-benchmarks to support your claims with data. 4. Add documentation for the new function to an .rst file under velox/docs/functions directory. * Functions in documentation are listed in alphabetical order. 
Make sure to diff --git a/Makefile b/Makefile index 1e12802cff0e..be290e806ea8 100644 --- a/Makefile +++ b/Makefile @@ -185,48 +185,6 @@ fuzzertest: debug --logtostderr=1 \ --minloglevel=0 -format-fix: #: Fix formatting issues in the main branch -ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","") - source ${PYTHON_VENV}/bin/activate; scripts/check.py format main --fix -else - scripts/check.py format main --fix -endif - -format-check: #: Check for formatting issues on the main branch - clang-format --version -ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","") - source ${PYTHON_VENV}/bin/activate; scripts/check.py format main -else - scripts/check.py format main -endif - -header-fix: #: Fix license header issues in the current branch -ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","") - source ${PYTHON_VENV}/bin/activate; scripts/check.py header main --fix -else - scripts/check.py header main --fix -endif - -header-check: #: Check for license header issues on the main branch -ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","") - source ${PYTHON_VENV}/bin/activate; scripts/check.py header main -else - scripts/check.py header main -endif - -circleci-container: #: Build the linux container for CircleCi - $(MAKE) linux-container CONTAINER_NAME=circleci - -check-container: - $(MAKE) linux-container CONTAINER_NAME=check - -linux-container: - rm -rf /tmp/docker && \ - mkdir -p /tmp/docker && \ - cp scripts/setup-helper-functions.sh scripts/setup-$(CONTAINER_NAME).sh scripts/$(CONTAINER_NAME)-container.dockfile /tmp/docker && \ - cd /tmp/docker && \ - docker build --build-arg cpu_target=$(CPU_TARGET) --tag "prestocpp/velox-$(CPU_TARGET)-$(CONTAINER_NAME):${USER}-$(shell date +%Y%m%d)" -f $(CONTAINER_NAME)-container.dockfile . - help: #: Show the help messages @cat $(firstword $(MAKEFILE_LIST)) | \ awk '/^[-a-z]+:/' | \ diff --git a/NOTICE.txt b/NOTICE.txt index 8b812aa41ab2..4fb5849fba09 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -10,7 +10,7 @@ This product includes software from the QT project (BSD, 3-clause). This product includes software from HowardHinnant's date library (MIT License). * https://github.com/HowardHinnant/date/tree/master -This product includes software from the The Arrow project. +This product includes software from the Arrow project. 
* https://github.com/apache/arrow/blob/apache-arrow-15.0.0/cpp/src/arrow/io/hdfs_internal.h * https://github.com/apache/arrow/blob/apache-arrow-15.0.0/cpp/src/arrow/io/hdfs_internal.cc Which contain the following NOTICE file: diff --git a/docker-compose.yml b/docker-compose.yml index 7dfc8752d243..d7cc50d1a8f0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,7 +28,7 @@ services: environment: NUM_THREADS: 8 # default value for NUM_THREADS VELOX_DEPENDENCY_SOURCE: BUNDLED # Build dependencies from source - CCACHE_DIR: "/velox/.ccache" + CCACHE_DIR: /velox/.ccache volumes: - .:/velox:delegated command: scripts/docker/docker-command.sh @@ -51,9 +51,10 @@ services: image: ghcr.io/facebookincubator/velox-dev:centos9 environment: NUM_THREADS: 8 # default value for NUM_THREADS - CCACHE_DIR: "/velox/.ccache" - EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_PARQUET=ON - -DVELOX_ENABLE_S3=ON + CCACHE_DIR: /velox/.ccache + EXTRA_CMAKE_FLAGS: > + -DVELOX_ENABLE_PARQUET=ON + -DVELOX_ENABLE_S3=ON volumes: - .:/velox:delegated working_dir: /velox @@ -77,9 +78,10 @@ services: image: ghcr.io/facebookincubator/velox-dev:centos9 environment: NUM_THREADS: 8 # default value for NUM_THREADS - CCACHE_DIR: "/velox/.ccache" - EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_PARQUET=ON - -DVELOX_ENABLE_S3=ON + CCACHE_DIR: /velox/.ccache + EXTRA_CMAKE_FLAGS: > + -DVELOX_ENABLE_PARQUET=ON + -DVELOX_ENABLE_S3=ON privileged: true deploy: resources: @@ -93,7 +95,6 @@ services: working_dir: /velox command: /velox/scripts/docker/docker-command.sh - centos-cpp: # Usage: # docker-compose pull centos-cpp or docker-compose build centos-cpp @@ -109,19 +110,19 @@ services: image: quay.io/centos/centos:stream9 environment: NUM_THREADS: 8 # default value for NUM_THREADS - CCACHE_DIR: "/velox/.ccache" + CCACHE_DIR: /velox/.ccache volumes: - .:/velox:delegated working_dir: /velox command: /velox/scripts/docker/docker-command.sh presto-java: - # Usage: - # docker-compose pull presto-java or docker-compose build presto-java - # docker-compose run --rm presto-java - # or - # docker-compose run -e NUM_THREADS=<NUM_THREADS> --rm presto-java - # to set the number of threads used during compilation + # Usage: + # docker-compose pull presto-java or docker-compose build presto-java + # docker-compose run --rm presto-java + # or + # docker-compose run -e NUM_THREADS=<NUM_THREADS> --rm presto-java + # to set the number of threads used during compilation image: ghcr.io/facebookincubator/velox-dev:presto-java build: args: @@ -130,19 +131,19 @@ services: dockerfile: scripts/docker/prestojava-container.dockerfile environment: NUM_THREADS: 8 # default value for NUM_THREADS - CCACHE_DIR: "/velox/.ccache" + CCACHE_DIR: /velox/.ccache volumes: - .:/velox:delegated working_dir: /velox command: /velox/scripts/docker/docker-command.sh spark-server: - # Usage: - # docker-compose pull spark-server or docker-compose build spark-server - # docker-compose run --rm spark-server - # or - # docker-compose run -e NUM_THREADS=<NUM_THREADS> --rm spark-server - # to set the number of threads used during compilation + # Usage: + # docker-compose pull spark-server or docker-compose build spark-server + # docker-compose run --rm spark-server + # or + # docker-compose run -e NUM_THREADS=<NUM_THREADS> --rm spark-server + # to set the number of threads used during compilation image: ghcr.io/facebookincubator/velox-dev:spark-server build: args: @@ -151,7 +152,7 @@ services: dockerfile: scripts/docker/spark-container.dockerfile environment: NUM_THREADS: 8 # default value for NUM_THREADS - CCACHE_DIR: "/velox/.ccache" + CCACHE_DIR: 
/velox/.ccache volumes: - .:/velox:delegated working_dir: /velox diff --git a/pyproject.toml b/pyproject.toml index adb39ac36343..439d447e583d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,3 @@ -#!/usr/bin/env python # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/python/pyvelox/__init__.py b/python/pyvelox/__init__.py index cd7b54a784b3..9bfe654c9eb6 100644 --- a/python/pyvelox/__init__.py +++ b/python/pyvelox/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .legacy import * -from .legacy import __version__ +from .legacy import * # noqa: F403 +from .legacy import __version__ as __version__ diff --git a/python/pyvelox/utils/__init__.py b/python/pyvelox/utils/__init__.py index ddb2e2b74b00..ef7f461f1d49 100644 --- a/python/pyvelox/utils/__init__.py +++ b/python/pyvelox/utils/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .data_generator import generate_tpch_data +from .data_generator import generate_tpch_data as generate_tpch_data diff --git a/python/pyvelox/utils/data_generator.py b/python/pyvelox/utils/data_generator.py old mode 100644 new mode 100755 index 526acd150711..0ccec166619a --- a/python/pyvelox/utils/data_generator.py +++ b/python/pyvelox/utils/data_generator.py @@ -157,7 +157,7 @@ def main() -> int: result = generate_tpch_data(**vars(args)) logging.info( - f"Written {result.row_count} records to {result.file_count} output files at '{result.output_path}'" # pyre-ignore + f"Written {result.row_count} records to {result.file_count} output files at '{result.output_path}'" # pyre-ignore ) return 0 if result else 1 diff --git a/python/pyvelox/utils/run_queries.py b/python/pyvelox/utils/run_queries.py old mode 100644 new mode 100755 diff --git a/python/test/test_plan_builder.py b/python/test/test_plan_builder.py index 180d0f3d3dbd..c0c1c8d1ec4c 100644 --- a/python/test/test_plan_builder.py +++ b/python/test/test_plan_builder.py @@ -55,7 +55,7 @@ def test_plan_builder(self): self.assertEqual( str(filter_node), - "-- Filter[2]\n" " -- Project[1]\n" " -- TableScan[0]\n", + "-- Filter[2]\n -- Project[1]\n -- TableScan[0]\n", ) def test_multiple_plan_builders(self): diff --git a/python/test/test_vector.py b/python/test/test_vector.py index f829eaaa4bf2..a8a97ffd9010 100644 --- a/python/test/test_vector.py +++ b/python/test/test_vector.py @@ -188,13 +188,13 @@ def test_array_vector(self): self.assertEqual(expected_firstElements[i], elements[i]) with self.assertRaises(TypeError): - a = pv.from_list([[[1, 2], [3, 4]], [[1.1], [2.3]]]) + _a = pv.from_list([[[1, 2], [3, 4]], [[1.1], [2.3]]]) with self.assertRaises(ValueError): - v = pv.from_list([[None], [None, None, None]]) + _v = pv.from_list([[None], [None, None, None]]) with self.assertRaises(TypeError): - a = pv.from_list([[[1, 2], [3, 4]], [["hello"], ["world"]]]) + _a = pv.from_list([[[1, 2], [3, 4]], [["hello"], ["world"]]]) def test_to_string(self): self.assertEqual( @@ -271,8 +271,8 @@ def test_numeric_limits(self): bigger_than_int32 = pv.from_list([1 << 33]) self.assertEqual(bigger_than_int32[0], 1 << 33) with self.assertRaises(RuntimeError): - bigger_than_int64 = pv.from_list([1 << 63]) - smaller_than_int64 = pv.from_list([(1 << 62) + (1 << 62) - 1]) + _bigger_than_int64 = pv.from_list([1 << 63]) + _smaller_than_int64 = pv.from_list([(1 << 62) + (1 << 62) - 
1]) def test_type(self): ints = pv.from_list([1, 2, None]) @@ -334,7 +334,7 @@ def test_slice(self): self.assertEqual(b[i], i + 2) with self.assertRaises(NotImplementedError): - c = a.slice(2, 6, 2) + _c = a.slice(2, 6, 2) d = a[3:6] self.assertEqual(len(d), 3) @@ -342,7 +342,7 @@ def test_slice(self): self.assertEqual(d[i], i + 3) with self.assertRaises(NotImplementedError): - e = a[3:8:3] + _e = a[3:8:3] def test_export_to_arrow(self): test_cases = [ diff --git a/scripts/check.py b/scripts/check.py deleted file mode 100755 index fda77f2b631a..000000000000 --- a/scripts/check.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -from collections import OrderedDict -import os -import regex -import subprocess -import sys - -from util import attrdict -import util - -EXTENSIONS = "cpp,h,inc,prolog" -SCRIPTS = util.script_path() - - -def get_diff(file, formatted): - if not formatted.endswith("\n"): - formatted = formatted + "\n" - - status, stdout, stderr = util.run( - f"diff -u {file} --label {file} --label {file} -", input=formatted - ) - if stdout != "": - stdout = f"diff a/{file} b/{file}\n" + stdout - - return status, stdout, stderr - - -class CppFormatter(str): - def diff(self, commit): - if commit == "": - return get_diff(self, util.run(f"clang-format --style=file {self}")[1]) - else: - return util.run( - f"{SCRIPTS}/git-clang-format -q --extensions='{EXTENSIONS}' --diff --style=file {commit} {self}" - ) - - def fix(self, commit): - if commit == "": - return util.run(f"clang-format -i --style=file {self}")[0] == 0 - else: - return ( - util.run( - f"{SCRIPTS}/git-clang-format -q --extensions='{EXTENSIONS}' --style=file {commit} {self}" - )[0] - == 0 - ) - - -class CMakeFormatter(str): - def __init__(self, commit) -> None: - super().__init__() - try: - import yaml - except ModuleNotFoundError: - # We need pyyaml so cmake-format can read '.cmake-format.yml' - # otherwise it will run with default - raise SystemExit("Please install 'pyyaml' for the CMake formatter.") - - def diff(self, commit): - return get_diff( - self, util.run(f"cmake-format --first-comment-is-literal True {self}")[1] - ) - - def fix(self, commit): - return ( - util.run(f"cmake-format --first-comment-is-literal True -i {self}")[0] == 0 - ) - - -class PythonFormatter(str): - def diff(self, commit): - return util.run(f"black -q --diff {self}") - - def fix(self, commit): - return util.run(f"black -q {self}")[0] == 0 - - -format_file_types = OrderedDict( - { - "CMakeLists.txt": attrdict({"formatter": CMakeFormatter}), - "*.cmake": attrdict({"formatter": CMakeFormatter}), - "*.cpp": attrdict({"formatter": CppFormatter}), - "*.h": attrdict({"formatter": CppFormatter}), - "*.inc": attrdict({"formatter": CppFormatter}), - "*.prolog": attrdict({"formatter": CppFormatter}), - "*.hpp": attrdict({"formatter": CppFormatter}), - "*.cu": attrdict({"formatter": CppFormatter}), - "*.cuh": attrdict({"formatter": 
CppFormatter}), - "*.clcpp": attrdict({"formatter": CppFormatter}), - "*.mm": attrdict({"formatter": CppFormatter}), - "*.metal": attrdict({"formatter": CppFormatter}), - "*.py": attrdict({"formatter": PythonFormatter}), - } -) - - -def get_formatter(filename): - if filename in format_file_types: - return format_file_types[filename] - - return format_file_types.get("*" + util.get_fileextn(filename), None) - - -def format_command(commit, files, fix): - ok = 0 - for filepath in files: - filename = util.get_filename(filepath) - filetype = get_formatter(filename) - - if filetype is None: - print("Skip : " + filepath, file=sys.stderr) - continue - - file = filetype.formatter(filepath) - - if fix == "show": - status, diff, stderr = file.diff(commit) - - if stderr != "": - ok = 1 - print(f"Error: {file}", file=sys.stderr) - continue - - if diff != "" and diff != "no modified files to format": - ok = 1 - print(f"Fix : {file}", file=sys.stderr) - print(diff) - else: - print(f"Ok : {file}", file=sys.stderr) - - else: - print(f"Fix : {file}", file=sys.stderr) - if not file.fix(commit): - ok = 1 - print(f"Error: {file}", file=sys.stderr) - - return ok - - -def header_command(commit, files, fix): - options = "-vk" if fix == "show" else "-i" - - status, stdout, stderr = util.run( - f"{SCRIPTS}/license-header.py {options} -", input=files - ) - - if stdout != "": - print(stdout) - - return status - - -def tidy_command(commit, files, fix): - files = [file for file in files if regex.match(r".*\.cpp$", file)] - - if not files: - return 0 - - commit = f"--commit {commit}" if commit != "" else "" - fix = "--fix" if fix == "fix" else "" - - status, stdout, stderr = util.run( - f"{SCRIPTS}/run-clang-tidy.py {commit} {fix} -", input=files - ) - - if stdout != "": - print(stdout) - - return status - - -def get_commit(files): - if files == "commit": - return "HEAD^" - - if files == "main" or files == "master": - return util.run(f"git merge-base origin/{files} HEAD")[1] - - return "" - - -def get_files(commit, path): - filelist = [] - - if commit != "": - status, stdout, stderr = util.run( - f"git diff --relative --name-only --diff-filter='ACMR' {commit}" - ) - filelist = stdout.splitlines() - else: - if os.path.isfile(path): - filelist.append(path) - else: - for root, dirs, files in os.walk(path): - for name in files: - filelist.append(os.path.join(root, name)) - - return [ - file - for file in filelist - if "/data/" not in file - and "velox/external/" not in file - and "build/fbcode_builder" not in file - and "build/deps" not in file - and "cmake-build-debug" not in file - and "NOTICE.txt" != file - and "velox/docs/affiliations_map.txt" != file - ] - - -def help(args): - parser.print_help() - return 0 - - -def add_check_options(subparser, name): - parser = subparser.add_parser(name) - parser.add_argument("--fix", action="store_const", default="show", const="fix") - return parser - - -def add_options(parser): - files = parser.add_subparsers(dest="files") - - tree_parser = add_check_options(files, "tree") - tree_parser.add_argument("path", default="") - - branch_parser = add_check_options(files, "main") - branch_parser = add_check_options(files, "master") - commit_parser = add_check_options(files, "commit") - - -def add_check_command(parser, name): - subparser = parser.add_parser(name) - add_options(subparser) - - return subparser - - -def parse_args(): - global parser - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - description="""Check format/header/tidy - - check.py 
{format,header,tidy} {commit,branch} [--fix] - check.py {format,header,tidy} {tree} [--fix] PATH -""", - ) - command = parser.add_subparsers(dest="command") - command.add_parser("help") - - format_command_parser = add_check_command(command, "format") - header_command_parser = add_check_command(command, "header") - tidy_command_parser = add_check_command(command, "tidy") - - parser.set_defaults(path="") - parser.set_defaults(command="help") - - return parser.parse_args() - - -def run_command(args, command): - commit = get_commit(args.files) - files = get_files(commit, args.path) - - return command(commit, files, args.fix) - - -def format(args): - return run_command(args, format_command) - - -def header(args): - return run_command(args, header_command) - - -def tidy(args): - return run_command(args, tidy_command) - - -def main(): - args = parse_args() - return globals()[args.command](args) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/license-header.py b/scripts/checks/license-header.py similarity index 98% rename from scripts/license-header.py rename to scripts/checks/license-header.py index a31df3f815a9..6f2ebb52e809 100755 --- a/scripts/license-header.py +++ b/scripts/checks/license-header.py @@ -28,7 +28,9 @@ class attrdict(dict): def parse_args(): parser = argparse.ArgumentParser(description="Update license headers") - parser.add_argument("--header", default="license.header", help="header file") + parser.add_argument( + "--header", default="scripts/checks/license.header", help="header file" + ) parser.add_argument( "--extra", default=80, diff --git a/license.header b/scripts/checks/license.header similarity index 100% rename from license.header rename to scripts/checks/license.header diff --git a/scripts/run-clang-tidy.py b/scripts/checks/run-clang-tidy.py similarity index 99% rename from scripts/run-clang-tidy.py rename to scripts/checks/run-clang-tidy.py index 00a80deeb194..5d7016c3d9ed 100755 --- a/scripts/run-clang-tidy.py +++ b/scripts/checks/run-clang-tidy.py @@ -14,7 +14,6 @@ # limitations under the License. import argparse -from itertools import groupby import json import regex import sys diff --git a/scripts/util.py b/scripts/checks/util.py similarity index 98% rename from scripts/util.py rename to scripts/checks/util.py index 3ed836c63db1..c0b34532a945 100644 --- a/scripts/util.py +++ b/scripts/checks/util.py @@ -37,7 +37,7 @@ def run(command, compressed=False, **kwargs): if "input" in kwargs: input = kwargs["input"] - if type(input) == list: + if type(input) is list: input = "\n".join(input) + "\n" kwargs["input"] = input.encode("utf-8") diff --git a/scripts/ci/benchmark-runner.py b/scripts/ci/benchmark-runner.py index f49deed1f156..8eba18282f3c 100755 --- a/scripts/ci/benchmark-runner.py +++ b/scripts/ci/benchmark-runner.py @@ -81,7 +81,7 @@ def get_retry_name(args, file_name): path = _normalize_path(file_name) try: parent_path = path.relative_to(_normalize_path(args.contender_path)) - except: + except Exception: parent_path = path.relative_to(_normalize_path(args.baseline_path)) return str(parent_path.parent) @@ -455,8 +455,7 @@ def parse_args(): parser_run.add_argument( "--binary_filter", default=None, - help="Filter applied to binary names. " - "By default execute all binaries found.", + help="Filter applied to binary names. 
By default execute all binaries found.", ) parser_run.add_argument( "--bm_filter", diff --git a/scripts/ci/bm-report/build-metrics.py b/scripts/ci/bm-report/build-metrics.py index 3829d403845a..57497f4d5b7f 100755 --- a/scripts/ci/bm-report/build-metrics.py +++ b/scripts/ci/bm-report/build-metrics.py @@ -14,7 +14,6 @@ # limitations under the License. import argparse -import sys import uuid from os.path import join, splitext from pathlib import Path @@ -125,8 +124,9 @@ def _transform_results(self) -> List[BenchmarkResult]: else: del log_lines[0] - ms2sec = lambda x: x / 1000 - get_epoch = lambda l: int(l.split()[2]) + def ms2sec(x): + return x / 1000 + totals = { "link_time": 0, "compile_time": 0, diff --git a/scripts/ci/bm-report/report.qmd b/scripts/ci/bm-report/report.qmd index 0c41ae0377a8..a91a116a6b93 100644 --- a/scripts/ci/bm-report/report.qmd +++ b/scripts/ci/bm-report/report.qmd @@ -222,7 +222,7 @@ searchable_table( :::: -### Debug +### Debug :::: {layout="[[50, 50],[50, 50]]" } ::: {} @@ -320,7 +320,7 @@ searchable_table( ::: ::: {} -```{r sizes-release} +```{r sizes-release} searchable_table( object_sizes_static, "release", "static", "Size", "Binary Sizes - Static" @@ -331,7 +331,7 @@ searchable_table( :::: -### Debug +### Debug :::: {layout="[50, 50]" } ::: {} @@ -353,4 +353,3 @@ searchable_table( ::: :::: - diff --git a/scripts/ci/hdfs-client.xml b/scripts/ci/hdfs-client.xml index 77b0ce611874..389b5376c0ac 100644 --- a/scripts/ci/hdfs-client.xml +++ b/scripts/ci/hdfs-client.xml @@ -4,4 +4,4 @@ dfs.client.log.severity FATAL - \ No newline at end of file + diff --git a/scripts/ci/presto/etc/hive.properties b/scripts/ci/presto/etc/hive.properties index e9a0d05c76a7..1ea8272de8c8 100644 --- a/scripts/ci/presto/etc/hive.properties +++ b/scripts/ci/presto/etc/hive.properties @@ -1,4 +1,4 @@ connector.name=hive-hadoop2 hive.metastore=file hive.metastore.catalog.dir=file:/opt/presto-server/etc/data -hive.allow-drop-table=true \ No newline at end of file +hive.allow-drop-table=true diff --git a/scripts/ci/presto/start-prestojava.sh b/scripts/ci/presto/start-prestojava.sh index 290e43af8afe..4a02636aa820 100755 --- a/scripts/ci/presto/start-prestojava.sh +++ b/scripts/ci/presto/start-prestojava.sh @@ -16,4 +16,3 @@ set -e "$PRESTO_HOME"/bin/launcher --pid-file=/tmp/pidfile run - diff --git a/scripts/ci/signature.py b/scripts/ci/signature.py index 51698a056eb7..daa876942f1a 100644 --- a/scripts/ci/signature.py +++ b/scripts/ci/signature.py @@ -146,7 +146,7 @@ def diff_signatures(base_signatures, contender_signatures, error_path=""): if "repetition_change" in delta: error_message = "" for rep_change in delta["repetition_change"]: - error_message += f"""'{rep_change.get_root_key()}{rep_change.t1}' is repeated {rep_change.repetition['new_repeat']} times.\n""" + error_message += f"""'{rep_change.get_root_key()}{rep_change.t1}' is repeated {rep_change.repetition["new_repeat"]} times.\n""" show_error(error_message, error_path) exit_status = 1 diff --git a/scripts/docker/pyvelox.dockerfile b/scripts/docker/pyvelox.dockerfile index c700ab371ed7..5757345ef9b8 100644 --- a/scripts/docker/pyvelox.dockerfile +++ b/scripts/docker/pyvelox.dockerfile @@ -28,4 +28,3 @@ RUN mkdir build && ( cd build && bash /setup-manylinux.sh ) && rm -rf build && \ dnf clean all ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH" - diff --git a/scripts/git-clang-format b/scripts/git-clang-format deleted file mode 100755 index 46e7f5cd0ca7..000000000000 --- a/scripts/git-clang-format +++ /dev/null @@ 
-1,622 +0,0 @@ -#!/usr/bin/env python3 -# -#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#============================================================================== -#LLVM Release License -#============================================================================== -#University of Illinois/NCSA -#Open Source License -# -#Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. -#All rights reserved. -# -#Developed by: -# -# LLVM Team -# -# University of Illinois at Urbana-Champaign -# -# http://llvm.org -# -#Permission is hereby granted, free of charge, to any person obtaining a copy of -#this software and associated documentation files (the "Software"), to deal with -#the Software without restriction, including without limitation the rights to -#use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -#of the Software, and to permit persons to whom the Software is furnished to do -#so, subject to the following conditions: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimers. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimers in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the names of the LLVM Team, University of Illinois at -# Urbana-Champaign, nor the names of its contributors may be used to -# endorse or promote products derived from this Software without specific -# prior written permission. -# -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -#FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -#SOFTWARE. -#===------------------------------------------------------------------------===# - -r""" -clang-format git integration -============================ - -This file provides a clang-format integration for git. Put it somewhere in your -path and ensure that it is executable. Then, "git clang-format" will invoke -clang-format on the changes in current files or a specific commit. - -For further details, run: -git clang-format -h - -Requires Python 2.7 or Python 3 -""" - -from __future__ import absolute_import, division, print_function -import argparse -import collections -import contextlib -import errno -import os -import re -import subprocess -import sys - -usage = 'git clang-format [OPTIONS] [] [] [--] [...]' - -desc = ''' -If zero or one commits are given, run clang-format on all lines that differ -between the working directory and , which defaults to HEAD. Changes are -only applied to the working directory. - -If two commits are given (requires --diff), run clang-format on all lines in the -second that differ from the first . 
- -The following git-config settings set the default of the corresponding option: - clangFormat.binary - clangFormat.commit - clangFormat.extension - clangFormat.style -''' - -# Name of the temporary index file in which save the output of clang-format. -# This file is created within the .git directory. -temp_index_basename = 'clang-format-index' - - -Range = collections.namedtuple('Range', 'start, count') - - -def main(): - config = load_git_config() - - # In order to keep '--' yet allow options after positionals, we need to - # check for '--' ourselves. (Setting nargs='*' throws away the '--', while - # nargs=argparse.REMAINDER disallows options after positionals.) - argv = sys.argv[1:] - try: - idx = argv.index('--') - except ValueError: - dash_dash = [] - else: - dash_dash = argv[idx:] - argv = argv[:idx] - - default_extensions = ','.join([ - # From clang/lib/Frontend/FrontendOptions.cpp, all lower case - 'c', 'h', # C - 'm', # ObjC - 'mm', # ObjC++ - 'cc', 'cp', 'cpp', 'c++', 'cxx', 'hpp', # C++ - 'cu', # CUDA - # Other languages that clang-format supports - 'proto', 'protodevel', # Protocol Buffers - 'java', # Java - 'js', # JavaScript - 'ts', # TypeScript - ]) - - p = argparse.ArgumentParser( - usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter, - description=desc) - p.add_argument('--binary', - default=config.get('clangformat.binary', 'clang-format'), - help='path to clang-format'), - p.add_argument('--commit', - default=config.get('clangformat.commit', 'HEAD'), - help='default commit to use if none is specified'), - p.add_argument('--diff', action='store_true', - help='print a diff instead of applying the changes') - p.add_argument('--extensions', - default=config.get('clangformat.extensions', - default_extensions), - help=('comma-separated list of file extensions to format, ' - 'excluding the period and case-insensitive')), - p.add_argument('-f', '--force', action='store_true', - help='allow changes to unstaged files') - p.add_argument('-p', '--patch', action='store_true', - help='select hunks interactively') - p.add_argument('-q', '--quiet', action='count', default=0, - help='print less information') - p.add_argument('--style', - default=config.get('clangformat.style', None), - help='passed to clang-format'), - p.add_argument('-v', '--verbose', action='count', default=0, - help='print extra information') - # We gather all the remaining positional arguments into 'args' since we need - # to use some heuristics to determine whether or not was present. - # However, to print pretty messages, we make use of metavar and help. 
- p.add_argument('args', nargs='*', metavar='', - help='revision from which to compute the diff') - p.add_argument('ignored', nargs='*', metavar='...', - help='if specified, only consider differences in these files') - opts = p.parse_args(argv) - - opts.verbose -= opts.quiet - del opts.quiet - - commits, files = interpret_args(opts.args, dash_dash, opts.commit) - if len(commits) > 1: - if not opts.diff: - die('--diff is required when two commits are given') - else: - if len(commits) > 2: - die('at most two commits allowed; %d given' % len(commits)) - changed_lines = compute_diff_and_extract_lines(commits, files) - if opts.verbose >= 1: - ignored_files = set(changed_lines) - filter_by_extension(changed_lines, opts.extensions.lower().split(',')) - if opts.verbose >= 1: - ignored_files.difference_update(changed_lines) - if ignored_files: - print('Ignoring changes in the following files (wrong extension):') - for filename in ignored_files: - print(' %s' % filename) - if changed_lines: - print('Running clang-format on the following files:') - for filename in changed_lines: - print(' %s' % filename) - if not changed_lines: - print('no modified files to format') - return - # The computed diff outputs absolute paths, so we must cd before accessing - # those files. - cd_to_toplevel() - if len(commits) > 1: - old_tree = commits[1] - new_tree = run_clang_format_and_save_to_tree(changed_lines, - revision=commits[1], - binary=opts.binary, - style=opts.style) - else: - old_tree = create_tree_from_workdir(changed_lines) - new_tree = run_clang_format_and_save_to_tree(changed_lines, - binary=opts.binary, - style=opts.style) - if opts.verbose >= 1: - print('old tree: %s' % old_tree) - print('new tree: %s' % new_tree) - if old_tree == new_tree: - if opts.verbose >= 0: - print('clang-format did not modify any files') - elif opts.diff: - print_diff(old_tree, new_tree) - else: - changed_files = apply_changes(old_tree, new_tree, force=opts.force, - patch_mode=opts.patch) - if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1: - print('changed files:') - for filename in changed_files: - print(' %s' % filename) - - -def load_git_config(non_string_options=None): - """Return the git configuration as a dictionary. - - All options are assumed to be strings unless in `non_string_options`, in which - is a dictionary mapping option name (in lower case) to either "--bool" or - "--int".""" - if non_string_options is None: - non_string_options = {} - out = {} - for entry in run('git', 'config', '--list', '--null').split('\0'): - if entry: - name, value = entry.split('\n', 1) - if name in non_string_options: - value = run('git', 'config', non_string_options[name], name) - out[name] = value - return out - - -def interpret_args(args, dash_dash, default_commit): - """Interpret `args` as "[commits] [--] [files]" and return (commits, files). - - It is assumed that "--" and everything that follows has been removed from - args and placed in `dash_dash`. - - If "--" is present (i.e., `dash_dash` is non-empty), the arguments to its - left (if present) are taken as commits. Otherwise, the arguments are checked - from left to right if they are commits or files. 
If commits are not given, - a list with `default_commit` is used.""" - if dash_dash: - if len(args) == 0: - commits = [default_commit] - else: - commits = args - for commit in commits: - object_type = get_object_type(commit) - if object_type not in ('commit', 'tag'): - if object_type is None: - die("'%s' is not a commit" % commit) - else: - die("'%s' is a %s, but a commit was expected" % (commit, object_type)) - files = dash_dash[1:] - elif args: - commits = [] - while args: - if not disambiguate_revision(args[0]): - break - commits.append(args.pop(0)) - if not commits: - commits = [default_commit] - files = args - else: - commits = [default_commit] - files = [] - return commits, files - - -def disambiguate_revision(value): - """Returns True if `value` is a revision, False if it is a file, or dies.""" - # If `value` is ambiguous (neither a commit nor a file), the following - # command will die with an appropriate error message. - run('git', 'rev-parse', value, verbose=False) - object_type = get_object_type(value) - if object_type is None: - return False - if object_type in ('commit', 'tag'): - return True - die('`%s` is a %s, but a commit or filename was expected' % - (value, object_type)) - - -def get_object_type(value): - """Returns a string description of an object's type, or None if it is not - a valid git object.""" - cmd = ['git', 'cat-file', '-t', value] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - return None - return convert_string(stdout.strip()) - - -def compute_diff_and_extract_lines(commits, files): - """Calls compute_diff() followed by extract_lines().""" - diff_process = compute_diff(commits, files) - changed_lines = extract_lines(diff_process.stdout) - diff_process.stdout.close() - diff_process.wait() - if diff_process.returncode != 0: - # Assume error was already printed to stderr. - sys.exit(2) - return changed_lines - - -def compute_diff(commits, files): - """Return a subprocess object producing the diff from `commits`. - - The return value's `stdin` file object will produce a patch with the - differences between the working directory and the first commit if a single - one was specified, or the difference between both specified commits, filtered - on `files` (if non-empty). Zero context lines are used in the patch.""" - git_tool = 'diff-index' - if len(commits) > 1: - git_tool = 'diff-tree' - cmd = ['git', git_tool, '-p', '-U0'] + commits + ['--'] - cmd.extend(files) - p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - p.stdin.close() - return p - - -def extract_lines(patch_file): - """Extract the changed lines in `patch_file`. - - The return value is a dictionary mapping filename to a list of (start_line, - line_count) pairs. - - The input must have been produced with ``-U0``, meaning unidiff format with - zero lines of context. 
The return value is a dict mapping filename to a - list of line `Range`s.""" - matches = {} - for line in patch_file: - line = convert_string(line) - match = re.search(r'^\+\+\+\ [^/]+/(.*)', line) - if match: - filename = match.group(1).rstrip('\r\n') - match = re.search(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?', line) - if match: - start_line = int(match.group(1)) - line_count = 1 - if match.group(3): - line_count = int(match.group(3)) - if line_count > 0: - matches.setdefault(filename, []).append(Range(start_line, line_count)) - return matches - - -def filter_by_extension(dictionary, allowed_extensions): - """Delete every key in `dictionary` that doesn't have an allowed extension. - - `allowed_extensions` must be a collection of lowercase file extensions, - excluding the period.""" - allowed_extensions = frozenset(allowed_extensions) - for filename in list(dictionary.keys()): - base_ext = filename.rsplit('.', 1) - if len(base_ext) == 1 and '' in allowed_extensions: - continue - if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions: - del dictionary[filename] - - -def cd_to_toplevel(): - """Change to the top level of the git repository.""" - toplevel = run('git', 'rev-parse', '--show-toplevel') - os.chdir(toplevel) - - -def create_tree_from_workdir(filenames): - """Create a new git tree with the given files from the working directory. - - Returns the object ID (SHA-1) of the created tree.""" - return create_tree(filenames, '--stdin') - - -def run_clang_format_and_save_to_tree(changed_lines, revision=None, - binary='clang-format', style=None): - """Run clang-format on each file and save the result to a git tree. - - Returns the object ID (SHA-1) of the created tree.""" - def iteritems(container): - try: - return container.iteritems() # Python 2 - except AttributeError: - return container.items() # Python 3 - def index_info_generator(): - for filename, line_ranges in iteritems(changed_lines): - if revision: - git_metadata_cmd = ['git', 'ls-tree', - '%s:%s' % (revision, os.path.dirname(filename)), - os.path.basename(filename)] - git_metadata = subprocess.Popen(git_metadata_cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - stdout = git_metadata.communicate()[0] - mode = oct(int(stdout.split()[0], 8)) - else: - mode = oct(os.stat(filename).st_mode) - # Adjust python3 octal format so that it matches what git expects - if mode.startswith('0o'): - mode = '0' + mode[2:] - blob_id = clang_format_to_blob(filename, line_ranges, - revision=revision, - binary=binary, - style=style) - yield '%s %s\t%s' % (mode, blob_id, filename) - return create_tree(index_info_generator(), '--index-info') - - -def create_tree(input_lines, mode): - """Create a tree object from the given input. - - If mode is '--stdin', it must be a list of filenames. If mode is - '--index-info' is must be a list of values suitable for "git update-index - --index-info", such as " ". Any other mode - is invalid.""" - assert mode in ('--stdin', '--index-info') - cmd = ['git', 'update-index', '--add', '-z', mode] - with temporary_index_file(): - p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - for line in input_lines: - p.stdin.write(to_bytes('%s\0' % line)) - p.stdin.close() - if p.wait() != 0: - die('`%s` failed' % ' '.join(cmd)) - tree_id = run('git', 'write-tree') - return tree_id - - -def clang_format_to_blob(filename, line_ranges, revision=None, - binary='clang-format', style=None): - """Run clang-format on the given file and save the result to a git blob. 
- - Runs on the file in `revision` if not None, or on the file in the working - directory if `revision` is None. - - Returns the object ID (SHA-1) of the created blob.""" - clang_format_cmd = [binary] - if style: - clang_format_cmd.extend(['-style='+style]) - clang_format_cmd.extend([ - '-lines=%s:%s' % (start_line, start_line+line_count-1) - for start_line, line_count in line_ranges]) - if revision: - clang_format_cmd.extend(['-assume-filename='+filename]) - git_show_cmd = ['git', 'cat-file', 'blob', '%s:%s' % (revision, filename)] - git_show = subprocess.Popen(git_show_cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - git_show.stdin.close() - clang_format_stdin = git_show.stdout - else: - clang_format_cmd.extend([filename]) - git_show = None - clang_format_stdin = subprocess.PIPE - try: - clang_format = subprocess.Popen(clang_format_cmd, stdin=clang_format_stdin, - stdout=subprocess.PIPE) - if clang_format_stdin == subprocess.PIPE: - clang_format_stdin = clang_format.stdin - except OSError as e: - if e.errno == errno.ENOENT: - die('cannot find executable "%s"' % binary) - else: - raise - clang_format_stdin.close() - hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin'] - hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout, - stdout=subprocess.PIPE) - clang_format.stdout.close() - stdout = hash_object.communicate()[0] - if hash_object.returncode != 0: - die('`%s` failed' % ' '.join(hash_object_cmd)) - if clang_format.wait() != 0: - die('`%s` failed' % ' '.join(clang_format_cmd)) - if git_show and git_show.wait() != 0: - die('`%s` failed' % ' '.join(git_show_cmd)) - return convert_string(stdout).rstrip('\r\n') - - -@contextlib.contextmanager -def temporary_index_file(tree=None): - """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting - the file afterward.""" - index_path = create_temporary_index(tree) - old_index_path = os.environ.get('GIT_INDEX_FILE') - os.environ['GIT_INDEX_FILE'] = index_path - try: - yield - finally: - if old_index_path is None: - del os.environ['GIT_INDEX_FILE'] - else: - os.environ['GIT_INDEX_FILE'] = old_index_path - os.remove(index_path) - - -def create_temporary_index(tree=None): - """Create a temporary index file and return the created file's path. - - If `tree` is not None, use that as the tree to read in. Otherwise, an - empty index is created.""" - gitdir = run('git', 'rev-parse', '--git-dir') - path = os.path.join(gitdir, temp_index_basename) - if tree is None: - tree = '--empty' - run('git', 'read-tree', '--index-output='+path, tree) - return path - - -def print_diff(old_tree, new_tree): - """Print the diff between the two trees to stdout.""" - # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output - # is expected to be viewed by the user, and only the former does nice things - # like color and pagination. - # - # We also only print modified files since `new_tree` only contains the files - # that were modified, so unmodified files would show as deleted without the - # filter. - subprocess.check_call(['git', 'diff', '--diff-filter=M', old_tree, new_tree, - '--']) - - -def apply_changes(old_tree, new_tree, force=False, patch_mode=False): - """Apply the changes in `new_tree` to the working directory. - - Bails if there are local changes in those files and not `force`. 
If - `patch_mode`, runs `git checkout --patch` to select hunks interactively.""" - changed_files = run('git', 'diff-tree', '--diff-filter=M', '-r', '-z', - '--name-only', old_tree, - new_tree).rstrip('\0').split('\0') - if not force: - unstaged_files = run('git', 'diff-files', '--name-status', *changed_files) - if unstaged_files: - print('The following files would be modified but ' - 'have unstaged changes:', file=sys.stderr) - print(unstaged_files, file=sys.stderr) - print('Please commit, stage, or stash them first.', file=sys.stderr) - sys.exit(2) - if patch_mode: - # In patch mode, we could just as well create an index from the new tree - # and checkout from that, but then the user will be presented with a - # message saying "Discard ... from worktree". Instead, we use the old - # tree as the index and checkout from new_tree, which gives the slightly - # better message, "Apply ... to index and worktree". This is not quite - # right, since it won't be applied to the user's index, but oh well. - with temporary_index_file(old_tree): - subprocess.check_call(['git', 'checkout', '--patch', new_tree]) - index_tree = old_tree - else: - with temporary_index_file(new_tree): - run('git', 'checkout-index', '-a', '-f') - return changed_files - - -def run(*args, **kwargs): - stdin = kwargs.pop('stdin', '') - verbose = kwargs.pop('verbose', True) - strip = kwargs.pop('strip', True) - for name in kwargs: - raise TypeError("run() got an unexpected keyword argument '%s'" % name) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - stdin=subprocess.PIPE) - stdout, stderr = p.communicate(input=stdin) - - stdout = convert_string(stdout) - stderr = convert_string(stderr) - - if p.returncode == 0: - if stderr: - if verbose: - print('`%s` printed to stderr:' % ' '.join(args), file=sys.stderr) - print(stderr.rstrip(), file=sys.stderr) - if strip: - stdout = stdout.rstrip('\r\n') - return stdout - if verbose: - print('`%s` returned %s' % (' '.join(args), p.returncode), file=sys.stderr) - if stderr: - print(stderr.rstrip(), file=sys.stderr) - sys.exit(2) - - -def die(message): - print('error:', message, file=sys.stderr) - sys.exit(2) - - -def to_bytes(str_input): - # Encode to UTF-8 to get binary data. - if isinstance(str_input, bytes): - return str_input - return str_input.encode('utf-8') - - -def to_string(bytes_input): - if isinstance(bytes_input, str): - return bytes_input - return bytes_input.encode('utf-8') - - -def convert_string(bytes_input): - try: - return to_string(bytes_input.decode('utf-8')) - except AttributeError: # 'str' object has no attribute 'decode'. 
- return str(bytes_input) - except UnicodeError: - return str(bytes_input) - -if __name__ == '__main__': - main() diff --git a/scripts/setup-centos9.sh b/scripts/setup-centos9.sh index 13ca2d69d77a..2ea4cf0bc112 100755 --- a/scripts/setup-centos9.sh +++ b/scripts/setup-centos9.sh @@ -86,7 +86,20 @@ function install_gflags { function install_cuda { # See https://developer.nvidia.com/cuda-downloads - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + local arch=$(uname -m) + local repo_url + + if [[ "$arch" == "x86_64" ]]; then + repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo" + elif [[ "$arch" == "aarch64" ]]; then + # Using SBSA (Server Base System Architecture) repository for ARM64 servers + repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/sbsa/cuda-rhel9.repo" + else + echo "Unsupported architecture: $arch" >&2 + return 1 + fi + + dnf config-manager --add-repo "$repo_url" local dashed="$(echo $1 | tr '.' '-')" dnf install -y \ cuda-compat-$dashed \ @@ -196,4 +209,3 @@ function install_velox_deps { dnf clean all fi ) - diff --git a/scripts/setup-classpath.sh b/scripts/setup-classpath.sh index e52184d92138..bfd7066dc63c 100644 --- a/scripts/setup-classpath.sh +++ b/scripts/setup-classpath.sh @@ -1,4 +1,3 @@ -#!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/scripts/setup-common.sh b/scripts/setup-common.sh index e2868181e3e1..5e74619953c4 100755 --- a/scripts/setup-common.sh +++ b/scripts/setup-common.sh @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# trigger reinstall SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") source $SCRIPTDIR/setup-helper-functions.sh diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh index 7f7cdf106400..d5867e736783 100755 --- a/scripts/setup-helper-functions.sh +++ b/scripts/setup-helper-functions.sh @@ -234,4 +234,3 @@ function cmake_install { cmake --build "${BINARY_DIR}" "-j ${NPROC}" || { echo 'build failed' ; exit 1; } ${SUDO} cmake --install "${BINARY_DIR}" } - diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index 209168ca3b37..d9c89e562dcb 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -39,7 +39,7 @@ export OS_CXXFLAGS=" -isystem $(brew --prefix)/include " export CMAKE_POLICY_VERSION_MINIMUM="3.5" DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)} -MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 simdjson snappy xz zstd" +MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 simdjson snappy xz xxhash zstd" MACOS_BUILD_DEPS="ninja cmake" SUDO="${SUDO:-""}" diff --git a/scripts/setup-manylinux.sh b/scripts/setup-manylinux.sh old mode 100644 new mode 100755 index b82db9263fbb..e781738568ec --- a/scripts/setup-manylinux.sh +++ b/scripts/setup-manylinux.sh @@ -262,7 +262,20 @@ function install_arrow { function install_cuda { # See https://developer.nvidia.com/cuda-downloads - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + local arch=$(uname -m) + local repo_url + + if [[ "$arch" == "x86_64" ]]; then + repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo" + elif [[ "$arch" == "aarch64" ]]; then + # Using SBSA (Server Base System Architecture) repository for ARM64 servers + repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo" + else + echo "Unsupported architecture: $arch" >&2 + return 1 + fi + + dnf config-manager --add-repo "$repo_url" local dashed="$(echo $1 | tr '.' '-')" dnf install -y cuda-nvcc-$dashed cuda-cudart-devel-$dashed cuda-nvrtc-devel-$dashed cuda-driver-devel-$dashed } diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index 3b27929c1c8f..7b48df0d3be6 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -156,12 +156,37 @@ function install_conda { function install_cuda { # See https://developer.nvidia.com/cuda-downloads + local arch=$(uname -m) + local os_ver + + if [[ ${VERSION} =~ "24.04" ]]; then + os_ver="ubuntu2404" + elif [[ ${VERSION} =~ "22.04" ]]; then + os_ver="ubuntu2204" + elif [[ ${VERSION} =~ "20.04" ]]; then + os_ver="ubuntu2004" + else + echo "Unsupported Ubuntu version: ${VERSION}" >&2 + return 1 + fi + + local cuda_repo + if [[ "$arch" == "x86_64" ]]; then + cuda_repo="${os_ver}/x86_64" + elif [[ "$arch" == "aarch64" ]]; then + cuda_repo="${os_ver}/sbsa" + else + echo "Unsupported architecture: $arch" >&2 + return 1 + fi + if ! dpkg -l cuda-keyring 1>/dev/null; then - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + wget https://developer.download.nvidia.com/compute/cuda/repos/${cuda_repo}/cuda-keyring_1.1-1_all.deb $SUDO dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb $SUDO apt update fi + local dashed="$(echo $1 | tr '.' 
'-')" $SUDO apt install -y \ cuda-compat-$dashed \ diff --git a/scripts/velox_env_linux.yml b/scripts/velox_env_linux.yml index 59ceeb0adb45..58e722cee5ac 100644 --- a/scripts/velox_env_linux.yml +++ b/scripts/velox_env_linux.yml @@ -22,7 +22,7 @@ variables: CXX: clang++ dependencies: -# tools + # tools - binutils - bison - clangxx=14 @@ -37,7 +37,7 @@ dependencies: - openjdk=8.* - python=3.8 - sysroot_linux-64=2.17 -# dependencies + # dependencies - aws-sdk-cpp - azure-identity-cpp - azure-storage-blobs-cpp diff --git a/scripts/velox_env_mac.yml b/scripts/velox_env_mac.yml index 776247a41f08..8c24af8e31a5 100644 --- a/scripts/velox_env_mac.yml +++ b/scripts/velox_env_mac.yml @@ -22,7 +22,7 @@ variables: CXX: clang++ dependencies: -# tools + # tools - binutils - bison - clangxx=14 # pin to something recent'ish to avoid warings on upgrade @@ -36,7 +36,7 @@ dependencies: - openjdk=8.* - python=3.8 - sysroot_linux-64=2.17 -# dependencies + # dependencies - aws-sdk-cpp - azure-identity-cpp - azure-storage-blobs-cpp @@ -64,4 +64,3 @@ dependencies: - xz - zlib - zstd - diff --git a/static/icon.svg b/static/icon.svg index d3e7a794ddc1..8ceed2d92a5d 100644 --- a/static/icon.svg +++ b/static/icon.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/static/logo.svg b/static/logo.svg index 67627d4cfd2b..0db7eec5f1f5 100644 --- a/static/logo.svg +++ b/static/logo.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/velox/common/base/RuntimeMetrics.h b/velox/common/base/RuntimeMetrics.h index 55b4d027da9d..abdb5e9361de 100644 --- a/velox/common/base/RuntimeMetrics.h +++ b/velox/common/base/RuntimeMetrics.h @@ -81,11 +81,10 @@ class BaseRuntimeStatWriter { /// thread. /// NOTE: This is only used by the Velox Driver at the moment, which ensures the /// active Operator is being used by the writer. -void setThreadLocalRunTimeStatWriter( - BaseRuntimeStatWriter* FOLLY_NULLABLE writer); +void setThreadLocalRunTimeStatWriter(BaseRuntimeStatWriter* writer); /// Retrives the current runtime stats writer. -BaseRuntimeStatWriter* FOLLY_NULLABLE getThreadLocalRunTimeStatWriter(); +BaseRuntimeStatWriter* getThreadLocalRunTimeStatWriter(); /// Writes runtime counter to the current Operator running on that thread. void addThreadLocalRuntimeStat( @@ -95,8 +94,7 @@ void addThreadLocalRuntimeStat( /// Scope guard to conveniently set and revert back the current stat writer. 
class RuntimeStatWriterScopeGuard { public: - explicit RuntimeStatWriterScopeGuard( - BaseRuntimeStatWriter* FOLLY_NULLABLE writer) + explicit RuntimeStatWriterScopeGuard(BaseRuntimeStatWriter* writer) : prevWriter_(getThreadLocalRunTimeStatWriter()) { setThreadLocalRunTimeStatWriter(writer); } @@ -106,7 +104,7 @@ class RuntimeStatWriterScopeGuard { } private: - BaseRuntimeStatWriter* const FOLLY_NULLABLE prevWriter_; + BaseRuntimeStatWriter* const prevWriter_; }; } // namespace facebook::velox diff --git a/velox/common/memory/ArbitrationParticipant.cpp b/velox/common/memory/ArbitrationParticipant.cpp index bc6b4e375172..a5341f521f83 100644 --- a/velox/common/memory/ArbitrationParticipant.cpp +++ b/velox/common/memory/ArbitrationParticipant.cpp @@ -109,8 +109,7 @@ ArbitrationParticipant::ArbitrationParticipant( pool_(pool.get()), config_(config), maxCapacity_(pool_->maxCapacity()), - createTimeNs_(getCurrentTimeNano()), - poolPriority_(pool_->poolPriority()) { + createTimeNs_(getCurrentTimeNano()) { VELOX_CHECK_LE( config_->minCapacity, maxCapacity_, diff --git a/velox/common/memory/ArbitrationParticipant.h b/velox/common/memory/ArbitrationParticipant.h index 4eb7763c8edb..75c9b2a509de 100644 --- a/velox/common/memory/ArbitrationParticipant.h +++ b/velox/common/memory/ArbitrationParticipant.h @@ -173,11 +173,6 @@ class ArbitrationParticipant return config_->minCapacity; } - /// Returns the priority of the underlying query memory pool. - uint32_t poolPriority() const { - return poolPriority_; - } - /// Returns the duration of this arbitration participant since its creation. uint64_t durationNs() const { const auto now = getCurrentTimeNano(); @@ -349,7 +344,6 @@ class ArbitrationParticipant const Config* const config_; const uint64_t maxCapacity_; const uint64_t createTimeNs_; - const uint32_t poolPriority_; mutable std::mutex stateLock_; bool aborted_{false}; diff --git a/velox/common/memory/HashStringAllocator.h b/velox/common/memory/HashStringAllocator.h index fd9fadc1c2b2..0c397b048d79 100644 --- a/velox/common/memory/HashStringAllocator.h +++ b/velox/common/memory/HashStringAllocator.h @@ -685,7 +685,7 @@ struct StlAllocator { explicit StlAllocator(HashStringAllocator* allocator) : allocator_{allocator} { - VELOX_CHECK(allocator); + VELOX_CHECK_NOT_NULL(allocator); } template @@ -722,7 +722,7 @@ struct StlAllocator { } private: - HashStringAllocator* allocator_; + HashStringAllocator* const allocator_; }; /// An allocator backed by HashStringAllocator that guaratees a configurable @@ -833,7 +833,7 @@ struct AlignedStlAllocator { return reinterpret_cast(alignedPtr); } - HashStringAllocator* allocator_; + HashStringAllocator* const allocator_; const bool poolAligned_; }; diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 3b9f3cbf716a..89e4ee302763 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -331,8 +331,7 @@ std::shared_ptr MemoryManager::addRootPool( const std::string& name, int64_t maxCapacity, std::unique_ptr reclaimer, - const std::optional& poolDebugOpts, - uint32_t poolPriority) { + const std::optional& poolDebugOpts) { std::string poolName = name; if (poolName.empty()) { static std::atomic poolId{0}; @@ -346,7 +345,6 @@ std::shared_ptr MemoryManager::addRootPool( options.coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_; options.getPreferredSize = getPreferredSize_; options.debugOptions = poolDebugOpts; - options.poolPriority = poolPriority; auto pool = createRootPool(poolName, reclaimer, 
options); if (!disableMemoryPoolTracking_) { diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index 317c32935927..981d64ca80cb 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -337,8 +337,7 @@ class MemoryManager { int64_t maxCapacity = kMaxMemory, std::unique_ptr reclaimer = nullptr, const std::optional& poolDebugOpts = - std::nullopt, - uint32_t poolPriority = 0); + std::nullopt); /// Creates a leaf memory pool for direct memory allocation use with specified /// 'name'. If 'name' is missing, the memory manager generates a default name diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index f1c01835bebb..5fd1427637cb 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -227,7 +227,6 @@ MemoryPool::MemoryPool( trackUsage_(options.trackUsage), threadSafe_(options.threadSafe), debugOptions_(options.debugOptions), - poolPriority_(options.poolPriority), coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled), getPreferredSize_( options.getPreferredSize == nullptr diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index cbb8a86cd23f..8d4175635e99 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -152,11 +152,6 @@ class MemoryPool : public std::enable_shared_from_this { /// If non-empty, enables debug mode for the created memory pool. std::optional debugOptions{std::nullopt}; - - /// Sets the priority of the memory pool. The priority is used for - /// determining which pools to abort when the system is out of memory. - /// higher poolPriority value respresents higher priority and vice-versa. - uint32_t poolPriority{0}; }; /// Constructs a named memory pool with specified 'name', 'parent' and 'kind'. @@ -302,11 +297,6 @@ class MemoryPool : public std::enable_shared_from_this { return alignment_; } - /// Returns the priority of this pool. - uint32_t poolPriority() const { - return poolPriority_; - } - /// Resource governing methods used to track and limit the memory usage /// through this memory pool object. 
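Taken together, these hunks remove the pool-level priority: the abort priority now travels with the MemoryReclaimer, which is what SharedArbitrator::sortAndGroupCandidatesByPriority below consults. A hedged sketch of the new wiring, where QueryReclaimer is illustrative and assumes the base class exposes its priority constructor to subclasses, as the mock reclaimer in the test diff does:

```cpp
#include "velox/common/memory/Memory.h"

using namespace facebook::velox;

// Illustrative reclaimer carrying the abort priority. Note the inverted sense
// relative to the removed poolPriority: a *larger* priority value means lower
// priority, so that pool's group is scanned for abort victims first.
class QueryReclaimer : public memory::MemoryReclaimer {
 public:
  explicit QueryReclaimer(int32_t priority)
      : memory::MemoryReclaimer(priority) {}
};

std::shared_ptr<memory::MemoryPool> makeRootPool(
    memory::MemoryManager* manager) {
  // Priority is no longer an addRootPool() argument; it rides on the
  // reclaimer instead.
  return manager->addRootPool(
      "query-0",
      512 << 20, // 512MB max capacity.
      std::make_unique<QueryReclaimer>(/*priority=*/2));
}
```

The test changes further down exercise exactly this ordering: the priority-2 task is aborted before either priority-1 task.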
@@ -556,7 +546,6 @@ class MemoryPool : public std::enable_shared_from_this { const bool trackUsage_; const bool threadSafe_; const std::optional debugOptions_; - const uint32_t poolPriority_; const bool coreOnAllocationFailureEnabled_; std::function getPreferredSize_; diff --git a/velox/common/memory/SharedArbitrator.cpp b/velox/common/memory/SharedArbitrator.cpp index b694b7aa4a75..93c2fd27fb03 100644 --- a/velox/common/memory/SharedArbitrator.cpp +++ b/velox/common/memory/SharedArbitrator.cpp @@ -519,6 +519,7 @@ std::vector SharedArbitrator::getCandidates( return candidates; } +// static void SharedArbitrator::sortCandidatesByReclaimableFreeCapacity( std::vector& candidates) { std::sort( @@ -532,6 +533,7 @@ void SharedArbitrator::sortCandidatesByReclaimableFreeCapacity( &candidates); } +// static void SharedArbitrator::sortCandidatesByReclaimableUsedCapacity( std::vector& candidates) { std::sort( @@ -546,6 +548,43 @@ void SharedArbitrator::sortCandidatesByReclaimableUsedCapacity( &candidates); } +// static +std::vector> +SharedArbitrator::sortAndGroupCandidatesByPriority( + std::vector&& candidates) { + std::sort( + candidates.begin(), + candidates.end(), + [](const ArbitrationCandidate& lhs, const ArbitrationCandidate& rhs) { + const auto* lhsReclaimer = lhs.participant->pool()->reclaimer(); + const auto* rhsReclaimer = rhs.participant->pool()->reclaimer(); + VELOX_CHECK_NOT_NULL(lhsReclaimer); + VELOX_CHECK_NOT_NULL(rhsReclaimer); + return lhsReclaimer->priority() > rhsReclaimer->priority(); + }); + + std::vector> candidateGroups; + int32_t prevPriority; + for (auto i = 0; i < candidates.size(); ++i) { + const auto curPriority = + candidates[i].participant->pool()->reclaimer()->priority(); + if (i == 0) { + prevPriority = curPriority; + candidateGroups.emplace_back( + std::vector{std::move(candidates[i])}); + continue; + } + if (curPriority != prevPriority) { + prevPriority = curPriority; + candidateGroups.emplace_back( + std::vector{std::move(candidates[i])}); + } else { + candidateGroups.back().push_back(std::move(candidates[i])); + } + } + return candidateGroups; +} + std::optional SharedArbitrator::findAbortCandidate( bool force) { auto candidates = getCandidates(); @@ -561,47 +600,35 @@ std::optional SharedArbitrator::findAbortCandidate( return std::nullopt; } - // Returns if other candidate should be chosen for abort. - // With the same capacity size bucket, we favor highest priority followed - // by oldest participant to not to be killed. This allows long running - // highest priority query to proceed first. 
- auto chooseAnotherCandidateForAbort = [&](const ArbitrationCandidate& current, - const ArbitrationCandidate& other) { - if (current.participant->poolPriority() < - other.participant->poolPriority()) { - return false; - } else if ( - current.participant->poolPriority() == - other.participant->poolPriority() && - current.participant->id() > other.participant->id()) { - return false; - } - return true; - }; - - for (uint64_t capacityLimit : globalArbitrationAbortCapacityLimits_) { - int32_t candidateIdx{-1}; - for (int32_t i = 0; i < candidates.size(); ++i) { - if (candidates[i].participant->aborted()) { - continue; + auto candidateGroups = + sortAndGroupCandidatesByPriority(std::move(candidates)); + + for (auto& candidateGroup : candidateGroups) { + for (uint64_t capacityLimit : globalArbitrationAbortCapacityLimits_) { + int32_t candidateIdx{-1}; + for (int32_t i = 0; i < candidateGroup.size(); ++i) { + if (candidateGroup[i].participant->aborted()) { + continue; + } + if (candidateGroup[i].currentCapacity < capacityLimit || + candidateGroup[i].currentCapacity == 0) { + continue; + } + if (candidateIdx == -1) { + candidateIdx = i; + continue; + } + // With the same capacity size bucket, we favor the old participant to + // not to be killed, to let long running query proceed first. + if (candidateGroup[candidateIdx].participant->id() < + candidateGroup[i].participant->id()) { + candidateIdx = i; + } } - if (candidates[i].currentCapacity < capacityLimit || - candidates[i].currentCapacity == 0) { - continue; - } - if (candidateIdx == -1) { - candidateIdx = i; - continue; - } - - if (chooseAnotherCandidateForAbort( - candidates[candidateIdx], candidates[i])) { - candidateIdx = i; + if (candidateIdx != -1) { + return candidateGroup[candidateIdx]; } } - if (candidateIdx != -1) { - return candidates[candidateIdx]; - } } if (!force) { @@ -609,22 +636,23 @@ std::optional SharedArbitrator::findAbortCandidate( return std::nullopt; } - // Can't find an eligible abort candidate and then return the lowest priority - // youngest candidate which has the largest participant id. - int32_t candidateIdx{-1}; - for (auto i = 0; i < candidates.size(); ++i) { - if (candidateIdx == -1) { - candidateIdx = i; - } else if (chooseAnotherCandidateForAbort( - candidates[candidateIdx], candidates[i])) { + // Can't find an eligible abort candidate and then return the youngest + // candidate (which has the largest participant id) in the lowest priority + // bucket. 
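Both the capacity-bucket scan above and this force-abort fallback rely on the same bucketing, sortAndGroupCandidatesByPriority: sort descending by priority value, then slice the sorted vector into runs of equal priority. A standalone sketch of that step, using an illustrative Candidate type in place of ArbitrationCandidate:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative stand-in for ArbitrationCandidate.
struct Candidate {
  int32_t priority; // Larger value = lower priority = earlier abort group.
  uint64_t id; // Larger id = younger participant (caller's tie-break only).
};

// Sort descending by priority value, then cut the sorted vector into runs of
// equal priority. Group 0 therefore holds the lowest-priority candidates.
std::vector<std::vector<Candidate>> groupByPriority(
    std::vector<Candidate> candidates) {
  std::sort(
      candidates.begin(),
      candidates.end(),
      [](const Candidate& lhs, const Candidate& rhs) {
        return lhs.priority > rhs.priority;
      });
  std::vector<std::vector<Candidate>> groups;
  for (const auto& candidate : candidates) {
    if (groups.empty() ||
        groups.back().back().priority != candidate.priority) {
      groups.emplace_back();
    }
    groups.back().push_back(candidate);
  }
  return groups;
}
```

Since the largest priority values sort first, group 0 is always the lowest-priority bucket, which is why the fallback only needs to scan candidateGroups[0] for the youngest participant.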
+ VELOX_CHECK(!candidateGroups.empty() && !candidateGroups[0].empty()); + int32_t candidateIdx{0}; + for (auto i = 0; i < candidateGroups[0].size(); ++i) { + if (candidateGroups[0][i].participant->id() > + candidateGroups[0][candidateIdx].participant->id()) { candidateIdx = i; } } - VELOX_CHECK_NE(candidateIdx, -1); + VELOX_MEM_LOG(WARNING) - << "Can't find an eligible abort victim and force to abort the youngest participant " - << candidates[candidateIdx].participant->name(); - return candidates[candidateIdx]; + << "Can't find an eligible abort victim and force to abort the youngest " + "participant " + << candidateGroups[0][candidateIdx].participant->name(); + return candidateGroups[0][candidateIdx]; } void SharedArbitrator::updateArbitrationRequestStats() { @@ -784,7 +812,6 @@ void SharedArbitrator::growCapacity(ArbitrationOperation& op) { } checkIfAborted(op); - checkIfTimeout(op); RETURN_IF_TRUE(maybeGrowFromSelf(op)); @@ -804,6 +831,9 @@ void SharedArbitrator::growCapacity(ArbitrationOperation& op) { succinctBytes(participantConfig_.minReclaimBytes)), op.participant()->pool()); } + + checkIfTimeout(op); + // After failing to acquire enough free capacity to fulfil this capacity // growth request, we will try to reclaim from the participant itself before // failing this operation. We only do this if global memory arbitration is @@ -1231,10 +1261,9 @@ uint64_t SharedArbitrator::reclaimUsedMemoryByAbort(bool force) { VELOX_MEM_POOL_ABORTED(fmt::format( "Memory pool aborted to reclaim used memory, current capacity {}, " "requesting capacity from global arbitration {} memory pool " - "priority:{}\nstats:\n{}\n{}", + "stats:\n{}\n{}", succinctBytes(currentCapacity), succinctBytes(victim.participant->globalArbitrationGrowCapacity()), - victim.participant->pool()->poolPriority(), victim.participant->pool()->toString(), victim.participant->pool()->treeMemoryUsage())); } catch (VeloxRuntimeError&) { diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h index 5a0238e460dc..527b564f62cc 100644 --- a/velox/common/memory/SharedArbitrator.h +++ b/velox/common/memory/SharedArbitrator.h @@ -483,6 +483,13 @@ class SharedArbitrator : public memory::MemoryArbitrator { // abort if there is no eligible one. uint64_t reclaimUsedMemoryByAbort(bool force); + // Sorts 'candidates' based on participant's reclaimer priority in descending + // order, putting lower priority ones (with higher priority value) first, and + // high priority ones (with lower priority value) later. + static std::vector> + sortAndGroupCandidatesByPriority( + std::vector&& candidates); + // Finds the participant victim to abort to free used memory based on the // participant's memory capacity and age. The function returns std::nullopt if // there is no eligible candidate. 
If 'force' is true, it picks up the diff --git a/velox/common/memory/tests/MockSharedArbitratorTest.cpp b/velox/common/memory/tests/MockSharedArbitratorTest.cpp index 732136a4768e..0801fbfd33f9 100644 --- a/velox/common/memory/tests/MockSharedArbitratorTest.cpp +++ b/velox/common/memory/tests/MockSharedArbitratorTest.cpp @@ -92,12 +92,13 @@ class MockTask : public std::enable_shared_from_this { class MemoryReclaimer : public memory::MemoryReclaimer { public: - MemoryReclaimer(const std::shared_ptr& task) - : memory::MemoryReclaimer(0), task_(task) {} + MemoryReclaimer(const std::shared_ptr& task, int32_t priority) + : memory::MemoryReclaimer(priority), task_(task) {} static std::unique_ptr create( - const std::shared_ptr& task) { - return std::make_unique(task); + const std::shared_ptr& task, + int32_t priority) { + return std::make_unique(task, priority); } void abort(MemoryPool* pool, const std::exception_ptr& error) override { @@ -113,16 +114,12 @@ class MockTask : public std::enable_shared_from_this { std::weak_ptr task_; }; - void initTaskPool( - MemoryManager* manager, - uint64_t capacity, - uint32_t taskPriority = 0) { + void + initTaskPool(MemoryManager* manager, uint64_t capacity, int32_t priority) { root_ = manager->addRootPool( fmt::format("RootPool-{}", poolId_++), capacity, - MemoryReclaimer::create(shared_from_this()), - std::nullopt, - taskPriority); + MemoryReclaimer::create(shared_from_this(), priority)); } MemoryPool* pool() const { @@ -516,9 +513,9 @@ class MockSharedArbitrationTest : public testing::Test { std::shared_ptr addTask( int64_t capacity = kMaxMemory, - uint32_t taskPriority = 0) { + int32_t priority = 0) { auto task = std::make_shared(); - task->initTaskPool(manager_.get(), capacity, taskPriority); + task->initTaskPool(manager_.get(), capacity, priority); return task; } @@ -2095,7 +2092,7 @@ TEST_F(MockSharedArbitrationTest, globalArbitrationSmallParticipantLargeGrow) { "Memory pool aborted to reclaim used memory"); } -TEST_F(MockSharedArbitrationTest, globalArbitrationWithMemoryPoolPriority) { +TEST_F(MockSharedArbitrationTest, globalArbitrationWithPriority) { // This test tests global arbitration takes into consideration query priority // attempting to grow capacity when selecting abort partitipants. 
const int64_t memoryCapacity = 512 << 20; @@ -2119,21 +2116,17 @@ TEST_F(MockSharedArbitrationTest, globalArbitrationWithMemoryPoolPriority) { 5 * 60 * 1'000'000'000UL, true); - // task0 is normal priority with 256MB capacity with initial allocation of - // 256MB - auto task0 = addTask(memoryCapacity / 2, 100); + auto task0 = addTask(384 << 20, 1); auto* op0 = task0->addMemoryOp(false); - op0->allocate(memoryCapacity / 2); + op0->allocate(384 << 20); - // task1 is low priority with 256MB capacity with initial allocation of 256MB - auto task1 = addTask(memoryCapacity / 2, 10); - auto* op1 = task1->addMemoryOp(true); - op1->allocate(memoryCapacity / 2); + auto task1 = addTask(64 << 20, 1); + auto* op1 = task1->addMemoryOp(false); + op1->allocate(64 << 20); - // task2 is normal priority in lower bucket has 256MB capacity with 0 - // allocation - auto task2 = addTask(memoryCapacity / 2, 999); - auto* op2 = task2->addMemoryOp(true); + auto task2 = addTask(64 << 20, 2); + auto* op2 = task2->addMemoryOp(false); + op2->allocate(64 << 20); std::unordered_map runtimeStats; auto statsWriter = std::make_unique(runtimeStats); @@ -2142,24 +2135,20 @@ TEST_F(MockSharedArbitrationTest, globalArbitrationWithMemoryPoolPriority) { // At this point, memory pool is full ASSERT_EQ(manager_->capacity(), manager_->getTotalBytes()); - // Next allocation should succeed with side effect of lowest priority - // query getting killed. - op2->allocate(memoryCapacity / 2); - - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT(runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - - // task1 gets aborted since its lowest priority compared to task0 - // task2 is younger in same bucket but survives due to priority. + arbitrator_->shrinkCapacity(64 << 20, false, true); ASSERT_TRUE(task0->error() == nullptr); - ASSERT_TRUE(task1->error() != nullptr); - ASSERT_TRUE(task2->error() == nullptr); + ASSERT_TRUE(task1->error() == nullptr); + VELOX_ASSERT_THROW( + std::rethrow_exception(task2->error()), + "Memory pool aborted to reclaim used memory"); + + arbitrator_->shrinkCapacity(64 << 20, false, true); + VELOX_ASSERT_THROW( + std::rethrow_exception(task0->error()), + "Memory pool aborted to reclaim used memory"); + ASSERT_TRUE(task1->error() == nullptr); + arbitrator_->shrinkCapacity(64 << 20, false, true); VELOX_ASSERT_THROW( std::rethrow_exception(task1->error()), "Memory pool aborted to reclaim used memory"); @@ -3133,7 +3122,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, localArbitrationTimeout) { op->allocate(memoryCapacity / 2); SCOPED_TESTVALUE_SET( - "facebook::velox::memory::ArbitrationParticipant::reclaim", + "facebook::velox::memory::SharedArbitrator::growCapacity", std::function( ([&](const ArbitrationParticipant* /*unused*/) { std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT @@ -3147,8 +3136,8 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, localArbitrationTimeout) { testing::HasSubstr("Memory arbitration timed out on memory pool")); } - // Reclaim happened before timeout check. - ASSERT_EQ(task->capacity(), 0); + // Timeout check happened before reclaim. 
+ ASSERT_EQ(task->capacity(), memoryCapacity / 2); } DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, reclaimLockTimeout) { diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 91aa8ccc4ba4..e59c0ab72141 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -4040,14 +4040,14 @@ class UnnestNode : public PlanNode { return unnestNames_; } - bool withOrdinality() const { - return ordinalityName_.has_value(); - } - const std::optional& ordinalityName() const { return ordinalityName_; } + bool withOrdinality() const { + return ordinalityName_.has_value(); + } + std::string_view name() const override { return "Unnest"; } diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 036a7a6d1072..7cca164728d1 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -626,6 +626,18 @@ class QueryConfig { static constexpr const char* kFieldNamesInJsonCastEnabled = "field_names_in_json_cast_enabled"; + /// If this is true, then operators that evaluate expressions will track their + /// stats and return them as part of their operator stats. Tracking these + /// stats can be expensive (especially if operator stats are retrieved + /// frequently) and this allows the user to explicitly enable it. + static constexpr const char* kOperatorTrackExpressionStats = + "operator_track_expression_stats"; + + /// If this is true, then the unnest operator might split output for each + /// input batch based on the output batch size control. Otherwise, it produces + /// a single output for each input batch. + static constexpr const char* kUnnestSplitOutput = "unnest_split_output"; + bool selectiveNimbleReaderEnabled() const { return get(kSelectiveNimbleReaderEnabled, false); } @@ -1142,6 +1154,14 @@ class QueryConfig { return get(kFieldNamesInJsonCastEnabled, false); } + bool operatorTrackExpressionStats() const { + return get(kOperatorTrackExpressionStats, false); + } + + bool unnestSplitOutput() const { + return get(kUnnestSplitOutput, true); + } + template T get(const std::string& key, const T& defaultValue) const { return config_->get(key, defaultValue); diff --git a/velox/docs/bindings/python/arrow.rst b/velox/docs/bindings/python/arrow.rst index 716617a78338..888c91679dd4 100644 --- a/velox/docs/bindings/python/arrow.rst +++ b/velox/docs/bindings/python/arrow.rst @@ -5,4 +5,4 @@ Pyvelox Arrow Api .. autofunction:: pyvelox.arrow.to_velox -.. autofunction:: pyvelox.arrow.to_arrow \ No newline at end of file +.. autofunction:: pyvelox.arrow.to_arrow diff --git a/velox/docs/bindings/python/index.rst b/velox/docs/bindings/python/index.rst index 4bf8155e62c5..7240f77719d4 100644 --- a/velox/docs/bindings/python/index.rst +++ b/velox/docs/bindings/python/index.rst @@ -13,6 +13,3 @@ Pyvelox Documentation runners file legacy - - - diff --git a/velox/docs/bindings/python/legacy.rst b/velox/docs/bindings/python/legacy.rst index f4c8a746a805..dddcdbe2d30e 100644 --- a/velox/docs/bindings/python/legacy.rst +++ b/velox/docs/bindings/python/legacy.rst @@ -9,4 +9,4 @@ Pyvelox Legacy Api .. autoclass:: pyvelox.legacy::BaseVector :members: - :special-members: \ No newline at end of file + :special-members: diff --git a/velox/docs/bindings/python/runners.rst b/velox/docs/bindings/python/runners.rst index a8f7251aae42..401b2f452611 100644 --- a/velox/docs/bindings/python/runners.rst +++ b/velox/docs/bindings/python/runners.rst @@ -5,4 +5,4 @@ Pyvelox Runners .. 
autoclass:: pyvelox.runner.LocalRunner :members: - :special-members: \ No newline at end of file + :special-members: diff --git a/velox/docs/bindings/python/vector.rst b/velox/docs/bindings/python/vector.rst index 4c8005b76aac..b4fca9604c71 100644 --- a/velox/docs/bindings/python/vector.rst +++ b/velox/docs/bindings/python/vector.rst @@ -5,4 +5,4 @@ Pyvelox Vectors .. autoclass:: pyvelox.vector.Vector :members: - :special-members: \ No newline at end of file + :special-members: diff --git a/velox/docs/conf.py b/velox/docs/conf.py index 1fecf2f68e61..d9dd62e7450b 100644 --- a/velox/docs/conf.py +++ b/velox/docs/conf.py @@ -30,7 +30,7 @@ try: sys.dont_write_bytecode = True -except: +except: # noqa E722 pass sys.path.insert(0, os.path.abspath("ext")) diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index e9c362e9c284..3bf86b52b331 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -167,6 +167,11 @@ Generic Configuration - 0 - Specifies the max number of input batches to prefetch to do index lookup ahead. If it is zero, then process one input batch at a time. + * - unnest_split_output + - bool + - true + - If this is true, then the unnest operator might split output for each input batch based on the + output batch size control. Otherwise, it produces a single output for each input batch. .. _expression-evaluation-conf: diff --git a/velox/docs/credits.py b/velox/docs/credits.py old mode 100644 new mode 100755 diff --git a/velox/docs/develop/aggregations.rst b/velox/docs/develop/aggregations.rst index bb776e9cefe0..934252f7c14a 100644 --- a/velox/docs/develop/aggregations.rst +++ b/velox/docs/develop/aggregations.rst @@ -118,9 +118,9 @@ Push-Down into Table Scan HashAggregation operator supports pushing down aggregations into table scan. Pushdown is enabled when all of the following conditions are met: -* the aggregation function takes a single argument, -* the argument is a column read directly from the table without any transformations, -* that column is not used anywhere else in the query. +* the aggregation function takes a single argument, +* the argument is a column read directly from the table without any transformations, +* that column is not used anywhere else in the query. For example, pushdown is possible in the following query: @@ -134,9 +134,9 @@ enabled in the following query: .. code-block:: sql - SELECT a, sum(b) - FROM t - WHERE a > 100 + SELECT a, sum(b) + FROM t + WHERE a > 100 GROUP BY 1 In these queries, TableScan operator produces "b" column as a LazyVector @@ -294,7 +294,7 @@ After receiving at least abandon_partial_aggregation_min_rows input rows, the operator checks the percentage of input rows that are unique, e.g. compares number of groups with number of input rows. If percentage of unique rows exceeds abandon_partial_aggregation_min_pct, the operator abandons partial -aggregation. +aggregation. It is not possible to simply stop aggregating inputs and pass these as is to shuffle and final aggregation because final aggregation expects data type that diff --git a/velox/docs/develop/debugging/print-plan-with-stats.rst b/velox/docs/develop/debugging/print-plan-with-stats.rst index 547feb93d475..3bb482763c6f 100644 --- a/velox/docs/develop/debugging/print-plan-with-stats.rst +++ b/velox/docs/develop/debugging/print-plan-with-stats.rst @@ -283,4 +283,3 @@ TableScan operator shows how many rows were processed by pushing down aggregatio ..
code-block:: loadedToValueHook sum: 50000, count: 5, min: 10000, max: 10000 - diff --git a/velox/docs/develop/dynamic-loading.rst b/velox/docs/develop/dynamic-loading.rst index c6ff23420da9..d720eee5ce3a 100644 --- a/velox/docs/develop/dynamic-loading.rst +++ b/velox/docs/develop/dynamic-loading.rst @@ -12,7 +12,7 @@ Getting Started 1. **Create a C++ file for your dynamic library** - For dynamically loaded function registration, the format followed mirrors that of built-in function registration with some noted differences. Using `DynamicTestFunction.cpp` as an example, the function uses the `extern "C"` keyword to protect against name mangling. + For dynamically loaded function registration, the format followed mirrors that of built-in function registration with some noted differences. Using `DynamicTestFunction.cpp` as an example, the function uses the `extern "C"` keyword to protect against name mangling. The `registrationFunctionName` function here acts as the entrypoint for the dynamic library for loading symbols. The `registrationFunctionName` function name is customizable and defaults to `registerExtensions` when not specified in the library loading call. Make sure to also include the necessary header file: @@ -77,4 +77,3 @@ Notes - In Velox, a function's signature is determined solely by its name and argument types. The return type is not taken into account. As a result, if a function with an identical signature is added but with a different return type, it will overwrite the existing function. - Function overloading is supported. Therefore, multiple functions can share the same name as long as they differ in the number or types of arguments. - diff --git a/velox/docs/develop/operators.rst b/velox/docs/develop/operators.rst index b8bbf02fa3e1..9af21bcdacd1 100644 --- a/velox/docs/develop/operators.rst +++ b/velox/docs/develop/operators.rst @@ -316,7 +316,7 @@ constructor within the Project operation. * - names - A list of new column names. -ExpandNode is typically used to compute GROUPING SETS, CUBE, ROLLUP and COUNT DISTINCT. +ExpandNode is typically used to compute GROUPING SETS, CUBE, ROLLUP and COUNT DISTINCT. To illustrate how ExpandNode works let's examine the following SQL query: @@ -347,7 +347,7 @@ After the computation by the ExpandNode, each row will generate 3 rows of data. .. code-block:: - l_suppkey l_orderkey l_partkey grouping_id_0 + l_suppkey l_orderkey l_partkey grouping_id_0 93 1 673 0 93 1 null 1 93 null null 3 @@ -389,15 +389,15 @@ For example, if the input rows are: .. code-block:: l_suppkey l_partkey - 93 673 - 75 674 + 93 673 + 75 674 38 22 After the computation by the ExpandNode, each row will generate 2 rows of data. So there will be a total of 6 rows: .. code-block:: - l_suppkey l_partkey grouping_id_0 + l_suppkey l_partkey grouping_id_0 93 null 1 null 673 2 75 null 1 @@ -409,7 +409,7 @@ Aggregation operator that follows, groups these rows by (l_suppkey, l_partkey, g ..
code-block:: - l_suppkey l_partkey grouping_id_0 + l_suppkey l_partkey grouping_id_0 93 null 1 75 null 1 38 null 1 diff --git a/velox/docs/develop/testing/memory-arbitration-fuzzer.rst b/velox/docs/develop/testing/memory-arbitration-fuzzer.rst index 2895138a6faf..b4a15c892450 100644 --- a/velox/docs/develop/testing/memory-arbitration-fuzzer.rst +++ b/velox/docs/develop/testing/memory-arbitration-fuzzer.rst @@ -19,11 +19,11 @@ It works as follows: How to run ---------- -Use velox_memory_arbitration_fuzzer_test binary to run this fuzzer: +Use velox_memory_arbitration_fuzzer binary to run this fuzzer: :: - velox/exec/tests/velox_memory_arbitration_fuzzer_test --seed 123 --duration_sec 60 + velox/exec/tests/velox_memory_arbitration_fuzzer --seed 123 --duration_sec 60 By default, the fuzzer will go through 10 iterations. Use --steps or --duration-sec flag to run fuzzer for longer. Use --seed to diff --git a/velox/docs/ext/issue.py b/velox/docs/ext/issue.py index ac95246c6077..b80be34c95ad 100644 --- a/velox/docs/ext/issue.py +++ b/velox/docs/ext/issue.py @@ -13,7 +13,7 @@ # limitations under the License. # noinspection PyUnresolvedReferences -from docutils import nodes, utils +from docutils import nodes # noinspection PyDefaultArgument,PyUnusedLocal diff --git a/velox/docs/ext/pr.py b/velox/docs/ext/pr.py index bba2e50dd1d2..745499b262cf 100644 --- a/velox/docs/ext/pr.py +++ b/velox/docs/ext/pr.py @@ -13,7 +13,7 @@ # limitations under the License. # noinspection PyUnresolvedReferences -from docutils import nodes, utils +from docutils import nodes # noinspection PyDefaultArgument,PyUnusedLocal diff --git a/velox/docs/ext/spark.py b/velox/docs/ext/spark.py index eede63543741..134a882db99f 100644 --- a/velox/docs/ext/spark.py +++ b/velox/docs/ext/spark.py @@ -653,7 +653,7 @@ def add_target_and_index( text = _("%s() (in module %s)") % (name, modname) self.indexnode["entries"].append(("single", text, node_id, "", None)) else: - text = f'{pairindextypes["builtin"]}; {name}()' + text = f"{pairindextypes['builtin']}; {name}()" self.indexnode["entries"].append(("pair", text, node_id, "", None)) def get_index_text(self, modname: str, name_cls: tuple[str, str]) -> str | None: diff --git a/velox/docs/functions/presto/aggregate.rst b/velox/docs/functions/presto/aggregate.rst index f07a2b2b4045..84e91d48a1c1 100644 --- a/velox/docs/functions/presto/aggregate.rst +++ b/velox/docs/functions/presto/aggregate.rst @@ -726,6 +726,15 @@ Noisy Aggregate Functions Unlike :func:`!count_if`, this function returns ``NULL`` when the (true) count is 0. +.. function:: noisy_count_gaussian(col, noise_scale) -> bigint + + Counts the non-null values in ``col`` and then adds a normally distributed random double + value with 0 mean and standard deviation of ``noise_scale`` to the true count. + The noisy count is post-processed to be non-negative and rounded to bigint. + + :: + SELECT noisy_count_gaussian(orderkey, 20.0) FROM tpch.tiny.lineitem; -- 60181 (1 row) + SELECT noisy_count_gaussian(orderkey, 20.0) FROM tpch.tiny.lineitem WHERE false; -- NULL (1 row) Miscellaneous ------------- diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst index 334b8d914310..1e9ee252b135 100644 --- a/velox/docs/functions/presto/binary.rst +++ b/velox/docs/functions/presto/binary.rst @@ -77,12 +77,12 @@ Binary Functions .. function:: lpad(binary, size, padbinary) -> varbinary :noindex: - + Left pads ``binary`` to ``size`` bytes with ``padbinary``. 
If ``size`` is less than the length of ``binary``, the result is truncated to ``size`` characters. ``size`` must not be negative and ``padbinary`` must be non-empty. ``size`` has a maximum value of 1 MiB. - In the case of ``size`` being smaller than the length of ``binary``, + In the case of ``size`` being smaller than the length of ``binary``, ``binary`` will be truncated from the right to fit the ``size``. .. function:: md5(binary) -> varbinary @@ -96,9 +96,9 @@ Binary Functions If ``size`` is less than the length of ``binary``, the result is truncated to ``size`` characters. ``size`` must not be negative and ``padbinary`` must be non-empty. ``size`` has a maximum value of 1 MiB. - In the case of ``size`` being smaller than the length of ``binary``, + In the case of ``size`` being smaller than the length of ``binary``, ``binary`` will be truncated from the right to fit the ``size``. - + .. function:: sha1(binary) -> varbinary Computes the SHA-1 hash of ``binary``. diff --git a/velox/docs/functions/presto/geospatial.rst b/velox/docs/functions/presto/geospatial.rst index 743cdf03fbfb..754ef58c0e37 100644 --- a/velox/docs/functions/presto/geospatial.rst +++ b/velox/docs/functions/presto/geospatial.rst @@ -188,6 +188,15 @@ Accessors Returns the point value that is the mathematical centroid of ``geometry``. Empty geometry inputs result in empty output. +.. function:: ST_Distance(geometry1: Geometry, geometry2: Geometry) -> distance: double + + Returns the 2-dimensional cartesian minimum distance (based on spatial ref) + between two geometries in projected units. Empty geometries result in null output. + +.. function:: ST_GeometryType(geometry: Geometry) -> type: varchar + + Returns the type of the geometry. + .. function:: ST_X(geometry: Geometry) -> x: double Returns the ``x`` coordinate of the geometry if it is a Point. Returns diff --git a/velox/docs/functions/presto/window.rst b/velox/docs/functions/presto/window.rst index 6d25073cb678..56c6b90bbee4 100644 --- a/velox/docs/functions/presto/window.rst +++ b/velox/docs/functions/presto/window.rst @@ -168,4 +168,4 @@ Aggregate functions ___________________ All aggregate functions can be used as window functions by adding the OVER clause. The aggregate function is computed -for each row over the rows within the current row's window frame. \ No newline at end of file +for each row over the rows within the current row's window frame. diff --git a/velox/docs/functions/spark/aggregate.rst b/velox/docs/functions/spark/aggregate.rst index 3abf81411eb1..a9ac2bd001c5 100644 --- a/velox/docs/functions/spark/aggregate.rst +++ b/velox/docs/functions/spark/aggregate.rst @@ -26,7 +26,7 @@ General Aggregate Functions Creates bloom filter from input hashes and returns it serialized into VARBINARY. The caller is expected to apply xxhash64 function to input data before calling bloom_filter_agg. - For example, + For example, bloom_filter_agg(xxhash64(x), 100, 1024) In Spark implementation, ``estimatedNumItems`` and ``numBits`` are used to decide the number of hash functions and bloom filter capacity. In Velox implementation, ``estimatedNumItems`` is not used. @@ -48,7 +48,7 @@ General Aggregate Functions But Spark allows for changing the defaults while Velox does not. .. spark:function:: bloom_filter_agg(hash) -> varbinary - + A version of ``bloom_filter_agg`` that uses the value of spark.bloom_filter.max_num_bits configuration property as ``numBits``. ``hash`` cannot be null.
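For example, a minimal sketch pairing the aggregate with its probe function (the table ``t`` and BIGINT column ``x`` here are hypothetical, not part of the official examples): :: SELECT might_contain((SELECT bloom_filter_agg(xxhash64(x)) FROM t), xxhash64(5)); The probe hashes the lookup value with xxhash64, matching how the filter was built; it may return true for a value that was never added (false positives are possible), but never false for one that was.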
@@ -162,11 +162,11 @@ General Aggregate Functions Returns the most frequent value for the values within ``x``. NULL values are ignored. If all the values are NULL, or there are 0 rows, returns NULL. - If multiple values have the same greatest frequency, the + If multiple values have the same greatest frequency, the return value could be any one of them. Example:: - + SELECT mode(x) FROM ( VALUES diff --git a/velox/docs/functions/spark/array.rst b/velox/docs/functions/spark/array.rst index 09eedad9fd21..88689787c3b4 100644 --- a/velox/docs/functions/spark/array.rst +++ b/velox/docs/functions/spark/array.rst @@ -262,7 +262,7 @@ Array Functions .. spark:function:: shuffle(array(E), seed) -> array(E) - Generates a random permutation of the given ``array`` using a seed derived + Generates a random permutation of the given ``array`` using a seed derived from the parameter ``seed`` and the configuration `spark.partition_id`. ``seed`` must be constant. :: diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst index 48ce669467f5..ab594134d870 100644 --- a/velox/docs/functions/spark/binary.rst +++ b/velox/docs/functions/spark/binary.rst @@ -32,9 +32,9 @@ Binary Functions .. spark:function:: might_contain(bloomFilter, value) -> boolean - Returns TRUE if ``bloomFilter`` might contain ``value``. + Returns TRUE if ``bloomFilter`` might contain ``value``. - ``bloomFilter`` is a VARBINARY computed using ::spark:function::`bloom_filter_agg` aggregate function. + ``bloomFilter`` is a VARBINARY computed using the :spark:function:`bloom_filter_agg` aggregate function. ``value`` is a BIGINT. .. spark:function:: sha1(x) -> varchar diff --git a/velox/docs/functions/spark/bitwise.rst b/velox/docs/functions/spark/bitwise.rst index 5924b36aa428..778284864249 100644 --- a/velox/docs/functions/spark/bitwise.rst +++ b/velox/docs/functions/spark/bitwise.rst @@ -4,7 +4,7 @@ Bitwise Functions .. spark:function:: bitwise_and(x, y) -> [same as input] - Returns the bitwise AND of ``x`` and ``y`` in 2's complement representation. + Returns the bitwise AND of ``x`` and ``y`` in 2's complement representation. Corresponds to Spark's operator ``&``. Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. @@ -47,4 +47,4 @@ Bitwise Functions .. spark:function:: shiftright(x, n) -> [same as x] - Returns x bitwise right shifted by n bits. Supported types for 'x' are INTEGER and BIGINT. \ No newline at end of file + Returns x bitwise right shifted by n bits. Supported types for 'x' are INTEGER and BIGINT. diff --git a/velox/docs/functions/spark/comparison.rst b/velox/docs/functions/spark/comparison.rst index 9b62e68c918b..7dc953930299 100644 --- a/velox/docs/functions/spark/comparison.rst +++ b/velox/docs/functions/spark/comparison.rst @@ -27,7 +27,7 @@ Comparison Functions Returns true if x is equal to y. Supports all scalar and complex types. The types of x and y must be the same. Corresponds to Spark's operators ``=`` and ``==``. Returns NULL for any NULL input, but nested nulls are compared as values. :: - + SELECT equalto(null, null); -- null SELECT equalto(null, ARRAY[1]); -- null SELECT equalto(ARRAY[1, null], ARRAY[1, null]); -- true @@ -44,7 +44,7 @@ Comparison Functions .. spark:function:: greatest(value1, value2, ..., valueN) -> [same as input] - Returns the largest of the provided values ignoring nulls. Supports all scalar types. + Returns the largest of the provided values ignoring nulls. Supports all scalar types. The types of all arguments must be the same.
:: SELECT greatest(10, 9, 2, 4, 3); -- 10 diff --git a/velox/docs/functions/spark/coverage.rst b/velox/docs/functions/spark/coverage.rst index 4bacd7f1ec29..f721415c82ab 100644 --- a/velox/docs/functions/spark/coverage.rst +++ b/velox/docs/functions/spark/coverage.rst @@ -81,73 +81,73 @@ ========================================= ========================================= ========================================= ========================================= ========================================= == ========================================= == ========================================= Scalar Functions Aggregate Functions Window Functions ===================================================================================================================================================================================================================== == ========================================= == ========================================= - :spark:func:`abs` count_if inline nvl sqrt any cume_dist - :spark:func:`acos count_min_sketch inline_outer nvl2 stack approx_count_distinct dense_rank - :spark:func:`acosh` covar_pop input_file_block_length octet_length std approx_percentile first_value - add_months covar_samp input_file_block_start or stddev array_agg lag - :spark:func:`aggregate` crc32 input_file_name overlay stddev_pop avg last_value - and cume_dist :spark:func:`instr` parse_url stddev_samp bit_and lead - any current_catalog int percent_rank str_to_map bit_or :spark:func:`nth_value` - approx_count_distinct current_database isnan percentile string :spark:func:`bit_xor` ntile - approx_percentile current_date :spark:func:`isnotnull` percentile_approx struct bool_and percent_rank - :spark:func:`array` current_timestamp :spark:func:`isnull` pi substr bool_or rank - :spark:func:`array_contains` current_timezone java_method :spark:func:`pmod` :spark:func:`substring` collect_list row_number - array_distinct current_user json_array_length posexplode substring_index collect_set - array_except date json_object_keys posexplode_outer sum corr - :spark:func:`array_intersect` date_add json_tuple position tan count - array_join date_format kurtosis positive tanh count_if - array_max date_from_unix_date lag pow timestamp count_min_sketch - array_min date_part last :spark:func:`power` timestamp_micros covar_pop - array_position date_sub last_day printf timestamp_millis covar_samp - array_remove date_trunc last_value quarter timestamp_seconds every - array_repeat datediff lcase radians tinyint :spark:func:`first` - :spark:func:`array_sort` day lead raise_error to_csv first_value - array_union dayofmonth :spark:func:`least` :spark:func:`rand` to_date grouping - arrays_overlap dayofweek :spark:func:`left` randn to_json grouping_id - arrays_zip dayofyear :spark:func:`length` random to_timestamp histogram_numeric - :spark:func:`ascii` decimal levenshtein range :spark:func:`to_unix_timestamp` kurtosis - asin decode like rank to_utc_timestamp :spark:func:`last` - :spark:func:`asinh` degrees ln reflect :spark:func:`transform` last_value - assert_true dense_rank locate regexp transform_keys max - atan div log :spark:func:`regexp_extract` transform_values max_by - atan2 double log10 regexp_extract_all translate mean - :spark:func:`atanh` e :spark:func:`log1p` regexp_like :spark:func:`trim` min - avg :spark:func:`element_at` log2 regexp_replace trunc min_by - base64 elt :spark:func:`lower` repeat try_add percentile - :spark:func:`between` encode lpad :spark:func:`replace` try_divide percentile_approx - bigint every 
:spark:func:`ltrim` reverse typeof regr_avgx - :spark:func:`bin` exists make_date right ucase regr_avgy - binary :spark:func:`exp` make_dt_interval rint unbase64 regr_count - bit_and explode make_interval :spark:func:`rlike` unhex regr_r2 - bit_count explode_outer make_timestamp :spark:func:`round` unix_date skewness - bit_get expm1 make_ym_interval row_number unix_micros some - bit_length extract :spark:func:`map` rpad unix_millis std - bit_or factorial map_concat :spark:func:`rtrim` unix_seconds stddev - bit_xor :spark:func:`filter` map_entries schema_of_csv :spark:func:`unix_timestamp` stddev_pop - bool_and find_in_set :spark:func:`map_filter` schema_of_json :spark:func:`upper` stddev_samp - bool_or first :spark:func:`map_from_arrays` second uuid sum - boolean first_value map_from_entries sentences var_pop try_avg - bround flatten map_keys sequence var_samp try_sum - btrim float map_values session_window variance var_pop - cardinality :spark:func:`floor` map_zip_with sha version var_samp - case forall max :spark:func:`sha1` weekday variance - cast format_number max_by :spark:func:`sha2` weekofyear - cbrt format_string :spark:func:`md5` :spark:func:`shiftleft` when - :spark:func:`ceil` from_csv mean :spark:func:`shiftright` width_bucket - ceiling from_json min shiftrightunsigned window - char from_unixtime min_by shuffle xpath - char_length from_utc_timestamp minute sign xpath_boolean - character_length :spark:func:`get_json_object` mod signum xpath_double - :spark:func:`chr` getbit monotonically_increasing_id sin xpath_float - coalesce :spark:func:`greatest` month :spark:func:`sinh` xpath_int - collect_list grouping months_between :spark:func:`size` xpath_long - collect_set grouping_id named_struct skewness xpath_number - :spark:func:`concat` :spark:func:`hash` nanvl slice xpath_short - concat_ws hex negative smallint xpath_string - conv hour next_day some :spark:func:`xxhash64` - corr :spark:func:`hypot` :spark:func:`not` :spark:func:`sort_array` :spark:func:`year` - cos if now soundex zip_with - cosh ifnull nth_value space - cot :spark:func:`in` ntile spark_partition_id - count initcap nullif :spark:func:`split` + :spark:func:`abs` count_if inline nvl sqrt any cume_dist + :spark:func:`acos count_min_sketch inline_outer nvl2 stack approx_count_distinct dense_rank + :spark:func:`acosh` covar_pop input_file_block_length octet_length std approx_percentile first_value + add_months covar_samp input_file_block_start or stddev array_agg lag + :spark:func:`aggregate` crc32 input_file_name overlay stddev_pop avg last_value + and cume_dist :spark:func:`instr` parse_url stddev_samp bit_and lead + any current_catalog int percent_rank str_to_map bit_or :spark:func:`nth_value` + approx_count_distinct current_database isnan percentile string :spark:func:`bit_xor` ntile + approx_percentile current_date :spark:func:`isnotnull` percentile_approx struct bool_and percent_rank + :spark:func:`array` current_timestamp :spark:func:`isnull` pi substr bool_or rank + :spark:func:`array_contains` current_timezone java_method :spark:func:`pmod` :spark:func:`substring` collect_list row_number + array_distinct current_user json_array_length posexplode substring_index collect_set + array_except date json_object_keys posexplode_outer sum corr + :spark:func:`array_intersect` date_add json_tuple position tan count + array_join date_format kurtosis positive tanh count_if + array_max date_from_unix_date lag pow timestamp count_min_sketch + array_min date_part last :spark:func:`power` timestamp_micros covar_pop + 
array_position date_sub last_day printf timestamp_millis covar_samp + array_remove date_trunc last_value quarter timestamp_seconds every + array_repeat datediff lcase radians tinyint :spark:func:`first` + :spark:func:`array_sort` day lead raise_error to_csv first_value + array_union dayofmonth :spark:func:`least` :spark:func:`rand` to_date grouping + arrays_overlap dayofweek :spark:func:`left` randn to_json grouping_id + arrays_zip dayofyear :spark:func:`length` random to_timestamp histogram_numeric + :spark:func:`ascii` decimal levenshtein range :spark:func:`to_unix_timestamp` kurtosis + asin decode like rank to_utc_timestamp :spark:func:`last` + :spark:func:`asinh` degrees ln reflect :spark:func:`transform` last_value + assert_true dense_rank locate regexp transform_keys max + atan div log :spark:func:`regexp_extract` transform_values max_by + atan2 double log10 regexp_extract_all translate mean + :spark:func:`atanh` e :spark:func:`log1p` regexp_like :spark:func:`trim` min + avg :spark:func:`element_at` log2 regexp_replace trunc min_by + base64 elt :spark:func:`lower` repeat try_add percentile + :spark:func:`between` encode lpad :spark:func:`replace` try_divide percentile_approx + bigint every :spark:func:`ltrim` reverse typeof regr_avgx + :spark:func:`bin` exists make_date right ucase regr_avgy + binary :spark:func:`exp` make_dt_interval rint unbase64 regr_count + bit_and explode make_interval :spark:func:`rlike` unhex regr_r2 + bit_count explode_outer make_timestamp :spark:func:`round` unix_date skewness + bit_get expm1 make_ym_interval row_number unix_micros some + bit_length extract :spark:func:`map` rpad unix_millis std + bit_or factorial map_concat :spark:func:`rtrim` unix_seconds stddev + bit_xor :spark:func:`filter` map_entries schema_of_csv :spark:func:`unix_timestamp` stddev_pop + bool_and find_in_set :spark:func:`map_filter` schema_of_json :spark:func:`upper` stddev_samp + bool_or first :spark:func:`map_from_arrays` second uuid sum + boolean first_value map_from_entries sentences var_pop try_avg + bround flatten map_keys sequence var_samp try_sum + btrim float map_values session_window variance var_pop + cardinality :spark:func:`floor` map_zip_with sha version var_samp + case forall max :spark:func:`sha1` weekday variance + cast format_number max_by :spark:func:`sha2` weekofyear + cbrt format_string :spark:func:`md5` :spark:func:`shiftleft` when + :spark:func:`ceil` from_csv mean :spark:func:`shiftright` width_bucket + ceiling from_json min shiftrightunsigned window + char from_unixtime min_by shuffle xpath + char_length from_utc_timestamp minute sign xpath_boolean + character_length :spark:func:`get_json_object` mod signum xpath_double + :spark:func:`chr` getbit monotonically_increasing_id sin xpath_float + coalesce :spark:func:`greatest` month :spark:func:`sinh` xpath_int + collect_list grouping months_between :spark:func:`size` xpath_long + collect_set grouping_id named_struct skewness xpath_number + :spark:func:`concat` :spark:func:`hash` nanvl slice xpath_short + concat_ws hex negative smallint xpath_string + conv hour next_day some :spark:func:`xxhash64` + corr :spark:func:`hypot` :spark:func:`not` :spark:func:`sort_array` :spark:func:`year` + cos if now soundex zip_with + cosh ifnull nth_value space + cot :spark:func:`in` ntile spark_partition_id + count initcap nullif :spark:func:`split` ========================================= ========================================= ========================================= ========================================= 
========================================= == ========================================= == ========================================= diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index aef9f3404780..9a3b56d11302 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -310,6 +310,26 @@ These functions support TIMESTAMP and DATE input types. SELECT to_utc_timestamp('2015-07-24 00:00:00', 'America/Los_Angeles'); -- '2015-07-24 07:00:00' +.. spark:function:: trunc(date, fmt) -> date + + Returns ``date`` truncated to the unit specified by the format model ``fmt``. + Returns NULL if ``fmt`` is invalid. + + ``fmt`` is case insensitive and must be one of the following: + * "YEAR", "YYYY", "YY" - truncate to the first date of the year that the ``date`` falls in + * "QUARTER" - truncate to the first date of the quarter that the ``date`` falls in + * "MONTH", "MM", "MON" - truncate to the first date of the month that the ``date`` falls in + * "WEEK" - truncate to the Monday of the week that the ``date`` falls in + + :: + + SELECT trunc('2019-08-04', 'week'); -- 2019-07-29 + SELECT trunc('2019-08-04', 'quarter'); -- 2019-07-01 + SELECT trunc('2009-02-12', 'MM'); -- 2009-02-01 + SELECT trunc('2015-10-27', 'YEAR'); -- 2015-01-01 + SELECT trunc('2015-10-27', ''); -- NULL + SELECT trunc('2015-10-27', 'day'); -- NULL + .. spark:function:: unix_date(date) -> integer Returns the number of days since 1970-01-01. :: @@ -428,4 +448,3 @@ returned for invalid format; otherwise, exception is thrown. :: SELECT from_unixtime(100, '!@#$%^&*'); -- throws exception) (for Joda date formatter) SELECT get_timestamp('1970-01-01', '!@#$%^&*'); -- NULL (parsing error) (for Simple date formatter) SELECT get_timestamp('1970-01-01', '!@#$%^&*'); -- throws exception) (for Joda date formatter) - diff --git a/velox/docs/functions/spark/decimal.rst b/velox/docs/functions/spark/decimal.rst index 630d580346f8..3d1c1c18fc23 100644 --- a/velox/docs/functions/spark/decimal.rst +++ b/velox/docs/functions/spark/decimal.rst @@ -160,12 +160,12 @@ Decimal Special Forms Returns ``decimal`` rounded to a new scale using HALF_UP rounding mode. In HALF_UP rounding, the digit 5 is rounded up. ``scale`` is the new scale to be rounded to. It is 0 by default, and integer in [INT_MIN, INT_MAX] is allowed to be its value. - When the absolute value of scale exceeds the maximum precision of long decimal (38), the round logic is equivalent to the case where it is 38 as we cannot exceed the maximum precision. + When the absolute value of scale exceeds the maximum precision of long decimal (38), the round logic is equivalent to the case where it is 38 as we cannot exceed the maximum precision. The result precision and scale are decided with the precision and scale of input ``decimal`` and ``scale``. After rounding we may need one more digit in the integral part. - + :: - + SELECT (round(cast (9.9 as decimal(2, 1)), 0)); -- decimal 10 SELECT (round(cast (99 as decimal(2, 0)), -1)); -- decimal 100 diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index d06a77b8eeee..5ffe3af0fca5 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -21,17 +21,17 @@ JSON Functions .. 
spark:function:: from_json(jsonString) -> array / map / row - Casts ``jsonString`` to an ARRAY, MAP, or ROW type, with the output type + Casts ``jsonString`` to an ARRAY, MAP, or ROW type, with the output type determined by the expression. Returns NULL, if the input string is unparsable. - Supported element types include BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, - REAL, DOUBLE, DATE, VARCHAR, ARRAY, MAP and ROW. When casting to ARRAY or MAP, - the element type of the array or the value type of the map must be one of - these supported types, and for maps, the key type must be VARCHAR. Casting - to ROW supports only JSON objects. - Note that since the result type can be inferred from the expression, in Velox we - do not need to provide the ``schema`` parameter as required by Spark's from_json + Supported element types include BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, + REAL, DOUBLE, DATE, VARCHAR, ARRAY, MAP and ROW. When casting to ARRAY or MAP, + the element type of the array or the value type of the map must be one of + these supported types, and for maps, the key type must be VARCHAR. Casting + to ROW supports only JSON objects. + Note that since the result type can be inferred from the expression, in Velox we + do not need to provide the ``schema`` parameter as required by Spark's from_json function. :: - + SELECT from_json('{"a": true}', 'a BOOLEAN'); -- {'a'=true} SELECT from_json('{"a": 1}', 'a INT'); -- {'a'=1} SELECT from_json('{"a": 1.0}', 'a DOUBLE'); -- {'a'=1.0} @@ -53,7 +53,7 @@ JSON Functions * Does not support schemas that include a corrupt record column, for example, the Spark function below is not supported. :: - from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE, _corrupt_record STRING') + from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE, _corrupt_record STRING') .. spark:function:: get_json_object(jsonString, path) -> varchar diff --git a/velox/docs/functions/spark/misc.rst b/velox/docs/functions/spark/misc.rst index f36a3362a3c1..de826e35b140 100644 --- a/velox/docs/functions/spark/misc.rst +++ b/velox/docs/functions/spark/misc.rst @@ -4,7 +4,7 @@ Miscellaneous Functions .. spark:function:: at_least_n_non_nulls(n, value1, value2, ..., valueN) -> bool - Returns true if there are at least ``n`` non-null and non-NaN values, + Returns true if there are at least ``n`` non-null and non-NaN values, or false otherwise. ``value1, value2, ..., valueN`` are evaluated lazily. If ``n`` non-null and non-NaN values are found, the function will stop evaluating the remaining arguments. If ``n <= 0``, the result is true. null diff --git a/velox/docs/functions/spark/string.rst b/velox/docs/functions/spark/string.rst index 7ad7141595b2..1269a1e1daad 100644 --- a/velox/docs/functions/spark/string.rst +++ b/velox/docs/functions/spark/string.rst @@ -3,9 +3,9 @@ String Functions ==================================== .. note:: - + Unless specified otherwise, all functions return NULL if at least one of the arguments is NULL. - + These functions assume that input strings contain valid UTF-8 encoded Unicode code points. The behavior is undefined if they are not. @@ -16,7 +16,7 @@ String Functions .. spark:function:: bit_length(string/binary) -> integer Returns the bit length for the specified string column. :: - + SELECT bit_length('123'); -- 24 .. spark:function:: chr(n) -> varchar @@ -47,7 +47,7 @@ String Functions .. spark:function:: contains(left, right) -> boolean Returns true if 'right' is found in 'left'. Otherwise, returns false. 
:: - + SELECT contains('Spark SQL', 'Spark'); -- true SELECT contains('Spark SQL', 'SPARK'); -- false SELECT contains('Spark SQL', null); -- NULL @@ -167,7 +167,7 @@ String Functions SELECT lower('SparkSql'); -- sparksql .. spark:function:: lpad(string, len, pad) -> string - + Returns ``string``, left-padded with pad to a length of ``len``. If ``string`` is longer than ``len``, the return value is shortened to ``len`` characters or bytes. If ``pad`` is not specified, ``string`` will be padded to the left with space characters @@ -249,7 +249,7 @@ String Functions .. spark:function:: repeat(input, n) -> varchar - Returns the string which repeats ``input`` ``n`` times. + Returns the string which repeats ``input`` ``n`` times. Result size must be less than or equal to 1MB. If ``n`` is less than or equal to 0, empty string is returned. :: @@ -277,15 +277,15 @@ String Functions Returns input string with characters in reverse order. .. spark:function:: rpad(string, len, pad) -> string - - Returns ``string``, right-padded with ``pad`` to a length of ``len``. + + Returns ``string``, right-padded with ``pad`` to a length of ``len``. If ``string`` is longer than ``len``, the return value is shortened to ``len`` characters. If ``pad`` is not specified, ``string`` will be padded to the right with space characters if it is a character string, and with zeros if it is a binary string. :: SELECT rpad('hi', 5, '??'); -- hi??? SELECT rpad('hi', 1, '??'); -- h - SELECT lpad('hi', 4); -- hi + SELECT rpad('hi', 4); -- hi .. spark:function:: rtrim(string) -> varchar @@ -318,7 +318,7 @@ String Functions contain all input beyond the last matched regex. When ``limit`` <= 0, ``regex`` will be applied as many times as possible, and the resulting array can be of any size. When ``delimiter`` is empty, if ``limit`` is smaller than the size of ``string``, the resulting array only contains ``limit`` number of single characters - splitting from ``string``, if ``limit`` is not provided or is larger than the size of ``string``, the resulting + splitting from ``string``, if ``limit`` is not provided or is larger than the size of ``string``, the resulting array contains all the single characters of ``string`` and does not include an empty tail character. The split function aligns with the vanilla Spark 3.4+ split function. :: @@ -357,7 +357,7 @@ String Functions Returns the rest of ``string`` from the starting position ``start``. Positions start with ``1``. A negative starting position is interpreted as being relative to the end of the string. When the starting position is 0, - the meaning is to refer to the first character.Type of 'start' must be an INTEGER. + the meaning is to refer to the first character. Type of 'start' must be an INTEGER. .. spark:function:: substring(string, start, length) -> varchar :noindex: @@ -410,8 +410,8 @@ String Functions size is larger than ``replace's``, the extra characters in ``match`` will be removed from ``string``. In addition, this function only considers the first occurrence of a character in ``match`` and uses its corresponding character in - ``replace`` for translation. - Any invalid UTF-8 characters present in the input string will be treated as a + ``replace`` for translation. + Any invalid UTF-8 characters present in the input string will be treated as a single character.:: SELECT translate('spark', 'sa', '12'); -- "1p2rk" @@ -438,3 +438,19 @@ String Functions Returns string with all characters changed to uppercase. :: SELECT upper('SparkSql'); -- SPARKSQL + +..
spark:function:: varchar_type_write_side_check(string, limit) -> varchar + + Removes trailing space characters (ASCII 32) that exceed the length ``limit`` from the end of input ``string``. ``limit`` is the maximum length of characters that can be allowed. + Throws exception when ``string`` still exceeds ``limit`` after trimming trailing spaces or when ``limit`` is not greater than 0. + Empty strings are returned as-is since they always satisfy any length ``limit`` greater than 0. + Note: This function is not directly callable in Spark SQL, but internally used for length check when writing string type columns. :: + + -- Function call examples (this function is not directly callable in Spark SQL). + varchar_type_write_side_check("abc", 3) -- "abc" + varchar_type_write_side_check("abc ", 3) -- "abc" + varchar_type_write_side_check("abcd", 3) -- VeloxUserError: "Exceeds allowed length limitation: '3'" + varchar_type_write_side_check("中国", 3) -- "中国" + varchar_type_write_side_check("中文中国", 3) -- VeloxUserError: "Exceeds allowed length limitation: '3'" + varchar_type_write_side_check(" ", 0) -- VeloxUserError: "The length limit must be greater than 0." + varchar_type_write_side_check("", 3) -- "" diff --git a/velox/docs/monitoring.rst b/velox/docs/monitoring.rst index b92026f9b637..e8f0c86c3397 100644 --- a/velox/docs/monitoring.rst +++ b/velox/docs/monitoring.rst @@ -6,4 +6,4 @@ Monitoring :maxdepth: 1 monitoring/metrics.rst - monitoring/stats.rst \ No newline at end of file + monitoring/stats.rst diff --git a/velox/docs/monitoring/stats.rst b/velox/docs/monitoring/stats.rst index 818902b58c13..030fe037e3fa 100644 --- a/velox/docs/monitoring/stats.rst +++ b/velox/docs/monitoring/stats.rst @@ -189,6 +189,9 @@ These stats are reported only by IndexLookupJoin operator - bytes - The byte size of the result data in velox vectors that are decoded from the raw data received from the remote storage lookup. + * - clientNumLazyDecodedResultBatches + - + - The number of lazy decoded result batches returned from the storage client. Spilling -------- diff --git a/velox/docs/monthly-updates/2021/december-2021.rst b/velox/docs/monthly-updates/2021/december-2021.rst index 76097bbeb183..5ec088f68531 100644 --- a/velox/docs/monthly-updates/2021/december-2021.rst +++ b/velox/docs/monthly-updates/2021/december-2021.rst @@ -35,4 +35,4 @@ Aditi Pandit, Alex Hornby, Amit Dutta, Andres Suarez, Andrew Gallagher, Chao Chen, Cheng Su, Deepak Majeti, Huameng Jiang, Jack Qiao, Kevin Wilfong, Krishna Pai, Laith Sakka, Marc Fisher, Masha Basmanova, Michael Shang, Naresh Kumar, Orri Erling, Pedro Eugenio Rocha Pedreira, Sergey Pershin, -Wei He, Wei Zheng, Xavier Deguillard, Yating Zhou, Yuan Chao Chou, Zhenyuan Zhao +Wei He, Wei Zheng, Xavier Deguillard, Yating Zhou, Yuan Chao Chou, Zhenyuan Zhao diff --git a/velox/docs/monthly-updates/2021/november-2021.rst b/velox/docs/monthly-updates/2021/november-2021.rst index 6626c5bf5c50..beb09b1110be 100644 --- a/velox/docs/monthly-updates/2021/november-2021.rst +++ b/velox/docs/monthly-updates/2021/november-2021.rst @@ -35,4 +35,4 @@ Chao Chen, Darren Fu, David Kang, Deepak Majeti, Huameng Jiang, Jake Jung, Jialiang Tan, Jialing Zhou, Justin Yang, Kevin Wilfong, Konstantin Tsoy, Krishna Pai, Laith Sakka, MJ Deng, Masha Basmanova, Michael Shang, Naresh Kumar, Orri Erling, Pedro Eugenio Rocha Pedreira, Thomas Orozco, Wei He, Yating -Zhou, Yuan Chao Chou, Zhenyuan Zhao, frankobe, ienkovich. \ No newline at end of file +Zhou, Yuan Chao Chou, Zhenyuan Zhao, frankobe, ienkovich. 
diff --git a/velox/docs/monthly-updates/2022/april-2022.rst b/velox/docs/monthly-updates/2022/april-2022.rst index dd44b388de8f..758b0d66fdfe 100644 --- a/velox/docs/monthly-updates/2022/april-2022.rst +++ b/velox/docs/monthly-updates/2022/april-2022.rst @@ -56,4 +56,4 @@ Majeti, Ge Gao, Huameng Jiang, James Xu, Jialiang Tan, Jimmy Lu, Jon Janzen, Jun Wu, Katie Mancini, Kevin Wilfong, Krishna Pai, Laith Sakka, Li Yazhou, MJ Deng, Masha Basmanova, Orri Erling, Pedro Eugenio Rocha Pedreira, Pyre Bot Jr, Richard Barnes, Sergey Pershin, Victor Zverovich, Wei He, Wenlei Xie, Xiang Xu, -Zeyi (Rice) Fan, qiaoyi.dingqy \ No newline at end of file +Zeyi (Rice) Fan, qiaoyi.dingqy diff --git a/velox/docs/monthly-updates/2022/august-2022.rst b/velox/docs/monthly-updates/2022/august-2022.rst index 46e21ff8a053..8208794d0922 100644 --- a/velox/docs/monthly-updates/2022/august-2022.rst +++ b/velox/docs/monthly-updates/2022/august-2022.rst @@ -85,4 +85,4 @@ Orvid King, Parvez Shaikh, Paul Saab, Pedro Eugenio Rocha Pedreira, Pramod, Pyre Bot Jr, Raúl Cumplido, Serge Druzkin, Sergey Pershin, Shiyu Gan, Shrikrishna (Shri) Khare, Taras Boiko, Victor Zverovich, Wei He, Wei Zheng, Xiaoxuan Meng, Yuan Chao Chou, Zhenyuan Zhao, erdembilegt.j, jiyu.cy, leoluan2009, -muniao, tanjialiang, usurai, yingsu00, 学东栾. \ No newline at end of file +muniao, tanjialiang, usurai, yingsu00, 学东栾. diff --git a/velox/docs/monthly-updates/2022/july-2022.rst b/velox/docs/monthly-updates/2022/july-2022.rst index 9580b11ba7ff..842a3aa9078a 100644 --- a/velox/docs/monthly-updates/2022/july-2022.rst +++ b/velox/docs/monthly-updates/2022/july-2022.rst @@ -55,4 +55,4 @@ Jialiang Tan, Jie1 Zhang, Jimmy Lu, Jonathan Mendoza, Karteek Murthy, Kevin Wilf Kimberly Yang, Krishna Pai, Laith Sakka, Masha Basmanova, Michael Shang, Naresh Kumar, Orri Erling, Orvid King, Pedro Eugenio Rocha Pedreira, PenghuiJiao, Pramod, Prasoon Telang, Scott Wolchok, Victor Zverovich, Wei He, Xavier Deguillard, Xiaoxuan Meng, Yoav Helfman, -Zeyi (Rice) Fan, Zhenyuan Zhao, usurai, yingsu00 \ No newline at end of file +Zeyi (Rice) Fan, Zhenyuan Zhao, usurai, yingsu00 diff --git a/velox/docs/monthly-updates/2022/june-2022.rst b/velox/docs/monthly-updates/2022/june-2022.rst index 8898e950f2ef..dc2dc0aa06df 100644 --- a/velox/docs/monthly-updates/2022/june-2022.rst +++ b/velox/docs/monthly-updates/2022/june-2022.rst @@ -63,4 +63,4 @@ Katie Mancini, Ke Jia, Kevin Wilfong, Krishna Pai, Laith Sakka, Masha Basmanova, Michael Shang, Mindaugas Rukas, Orri Erling, Patrick Stuedi, Paul Saab, Pedro Eugenio Rocha Pedreira, Pramod Sathyanarayana, Sahana CB, Sergey Pershin, Wei He, Xavier Deguillard, Xiaoxuan Meng, Yating Zhou, Yoav Helfman, Zeyi (Rice) Fan, -Zhenyuan Zhao, artem.malyshev, benitakbritto, frankobe, usurai, yingsu00, zhaozhenhui \ No newline at end of file +Zhenyuan Zhao, artem.malyshev, benitakbritto, frankobe, usurai, yingsu00, zhaozhenhui diff --git a/velox/docs/monthly-updates/2022/may-2022.rst b/velox/docs/monthly-updates/2022/may-2022.rst index b76e94524849..4cd772ed4bb0 100644 --- a/velox/docs/monthly-updates/2022/may-2022.rst +++ b/velox/docs/monthly-updates/2022/may-2022.rst @@ -64,4 +64,4 @@ Jialiang Tan, Jie1 Zhang, Jimmy Lu, Jing Zhu, John Reese, Karteek Murthy, Kevin Wilfong, Krishna Pai, Laith Sakka, MJ Deng, Masha Basmanova, Muir Manders, Orri Erling, Patrick Stuedi, Pedro Eugenio Rocha Pedreira, Pyre Bot Jr, Rui Mo, Sergey Pershin, TJ Yin, Wei He, Zhenyuan Zhao, artem.malyshev, rui-mo, usurai, -xuedongluan, yeyuqiang, yingsu00 \ No newline at end of 
file +xuedongluan, yeyuqiang, yingsu00 diff --git a/velox/docs/monthly-updates/2022/october-2022.rst b/velox/docs/monthly-updates/2022/october-2022.rst index c8663d6dd3b9..39ee8752ccc9 100644 --- a/velox/docs/monthly-updates/2022/october-2022.rst +++ b/velox/docs/monthly-updates/2022/october-2022.rst @@ -61,4 +61,4 @@ Shang, Mike Decker, Milosz Linkiewicz, Open Source Bot, Orri Erling, Patrick Somaru, Pavel Solodovnikov, Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Qitian Zeng, Randeep Singh, Raúl Cumplido, Sergey Pershin, Uhyon Chung, Vinti Pandey, Wei He, Weile Wei, Zeyi (Rice) Fan, Zhenyuan Zhao, mwish, -shengxuan.liu, tanjialiang, xiaoxmeng, yingsu00, zhejiangxiaomai \ No newline at end of file +shengxuan.liu, tanjialiang, xiaoxmeng, yingsu00, zhejiangxiaomai diff --git a/velox/docs/monthly-updates/2023/august-2023.rst b/velox/docs/monthly-updates/2023/august-2023.rst index 33e574fcf5ff..3ab81921ace9 100644 --- a/velox/docs/monthly-updates/2023/august-2023.rst +++ b/velox/docs/monthly-updates/2023/august-2023.rst @@ -82,4 +82,4 @@ Build Systems Credits ======= -Alexander Yermolovich, Amit Dutta, Ann Rose Benny, Arun D. Panicker, Ashwin Krishna Kumar, Austin Dickey, Bikramjeet Vig, Chengcheng Jin, Christian Zentgraf, Daniel Munoz, David Tolnay, Deepak Majeti, Ebe Janchivdorj, Ge Gao, Giuseppe Ottaviano, Harsha Rastogi, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, Jialiang Tan, Jimmy Lu, Karteek Murthy Samba Murthy, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Laith Sakka, Luca Niccolini, Ma-Jian1, Mack Ward, Mahadevuni Naveen Kumar, Masha Basmanova, Mike Lui, Nick Terrell, Open Source Bot, Orri Erling, Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Pranjal Shankhdhar, Richard Barnes, Rong Ma, Sandino Flores, Sanjiban Sengupta, Shiyu Gan, Wei He, Zac, Zhe Wan, aditi-pandit, duanmeng, ericyuliu, generatedunixname89002005287564, generatedunixname89002005325676, jackylee-ch, leesf, root, rui-mo, wangxinshuo.db, wypb, xiaoxmeng, yingsu00, yiweiHeOSS, zhejiangxiaomai, 陈旭 \ No newline at end of file +Alexander Yermolovich, Amit Dutta, Ann Rose Benny, Arun D. 
Panicker, Ashwin Krishna Kumar, Austin Dickey, Bikramjeet Vig, Chengcheng Jin, Christian Zentgraf, Daniel Munoz, David Tolnay, Deepak Majeti, Ebe Janchivdorj, Ge Gao, Giuseppe Ottaviano, Harsha Rastogi, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, Jialiang Tan, Jimmy Lu, Karteek Murthy Samba Murthy, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Laith Sakka, Luca Niccolini, Ma-Jian1, Mack Ward, Mahadevuni Naveen Kumar, Masha Basmanova, Mike Lui, Nick Terrell, Open Source Bot, Orri Erling, Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Pranjal Shankhdhar, Richard Barnes, Rong Ma, Sandino Flores, Sanjiban Sengupta, Shiyu Gan, Wei He, Zac, Zhe Wan, aditi-pandit, duanmeng, ericyuliu, generatedunixname89002005287564, generatedunixname89002005325676, jackylee-ch, leesf, root, rui-mo, wangxinshuo.db, wypb, xiaoxmeng, yingsu00, yiweiHeOSS, zhejiangxiaomai, 陈旭 diff --git a/velox/docs/monthly-updates/2023/december-2023.rst b/velox/docs/monthly-updates/2023/december-2023.rst index 8f4df40a953b..021d32e6f770 100644 --- a/velox/docs/monthly-updates/2023/december-2023.rst +++ b/velox/docs/monthly-updates/2023/december-2023.rst @@ -96,4 +96,4 @@ Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod,Ravi Rahm Richard Barnes, Sergey Pershin, Srikrishna Gopu, Wei He, Xiaoxuan Meng, Yangyang Gao, Yedidya Feldblum, Zac, aditi-pandit, binwei, duanmeng, hengjiang.ly, joey.ljy, rui-mo, shangjing.cxw, soumyaduriseti, xiaoxmeng, xiyu.zk, xumingming, yan ma, yangchuan ,yingsu00, -zhli, zhli1142015, 高阳阳 \ No newline at end of file +zhli, zhli1142015, 高阳阳 diff --git a/velox/docs/monthly-updates/2023/november-2023.rst b/velox/docs/monthly-updates/2023/november-2023.rst index 40bbf2525bc8..536ea83d167c 100644 --- a/velox/docs/monthly-updates/2023/november-2023.rst +++ b/velox/docs/monthly-updates/2023/november-2023.rst @@ -102,4 +102,4 @@ Daniel Munoz, Deepak Majeti, Ge Gao, Genevieve (Genna) Helsel, Harvey Hunt, Jake Jimmy Lu, John Elliott, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Laith Sakka, Masha Basmanova, Orri Erling, PHILO-HE, Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pramod, Richard Barnes, Schierbeck, Cody, Sergey Pershin, Wei He, Zhenyuan Zhao, aditi-pandit, curt, duanmeng, joey.ljy, lingbin, rui-mo, usurai, vibhatha, wypb, xiaoxmeng, -xumingming, yangchuan, yaqi-zhao, yingsu00, yiweiHeOSS, youxiduo, zhli, 高阳阳 \ No newline at end of file +xumingming, yangchuan, yaqi-zhao, yingsu00, yiweiHeOSS, youxiduo, zhli, 高阳阳 diff --git a/velox/docs/monthly-updates/january-2024.rst b/velox/docs/monthly-updates/january-2024.rst index dc917581e8e6..eeae8d367219 100644 --- a/velox/docs/monthly-updates/january-2024.rst +++ b/velox/docs/monthly-updates/january-2024.rst @@ -81,4 +81,3 @@ Cody, Sergey Pershin, Sitao Lv, Taras Galkovskyi, Wei He, Yedidya Feldblum, Yuan Zhou, Yuping Fan, Zac Wen, aditi-pandit, binwei, duanmeng, hengjiang.ly, icejoywoo, lingbin, mwish, rui-mo, wypb, xiaoxmeng, xumingming, yangchuan, yingsu00, youxiduo, yuling.sh, zhli1142015, zky.zhoukeyong, zwangsheng - diff --git a/velox/docs/monthly-updates/july-2024.rst b/velox/docs/monthly-updates/july-2024.rst index 26e34b91135a..4e1dc0c78289 100644 --- a/velox/docs/monthly-updates/july-2024.rst +++ b/velox/docs/monthly-updates/july-2024.rst @@ -125,4 +125,4 @@ Credits 5 xiaoxmeng - Meta 2 Ying Su - IBM 2 youxiduo - 12 Zhen Li - Microsoft \ No newline at end of file + 12 Zhen Li - Microsoft diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp 
index ba6600002651..5342e4591e15 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp @@ -254,6 +254,9 @@ void SelectiveListColumnReader::read( makeNestedRowSet(activeRows, rows.back()); if (child_ && !nestedRows_.empty()) { child_->read(child_->readOffset(), nestedRows_, nullptr); + nestedRowsAllSelected_ = nestedRowsAllSelected_ && + nestedRows_.size() == child_->outputRows().size(); + nestedRows_ = child_->outputRows(); } numValues_ = activeRows.size(); readOffset_ = offset + rows.back() + 1; @@ -338,6 +341,9 @@ void SelectiveMapColumnReader::read( nestedRows_ = keyReader_->outputRows(); if (!nestedRows_.empty()) { elementReader_->read(elementReader_->readOffset(), nestedRows_, nullptr); + nestedRowsAllSelected_ = nestedRowsAllSelected_ && + nestedRows_.size() == elementReader_->outputRows().size(); + nestedRows_ = elementReader_->outputRows(); } } numValues_ = activeRows.size(); diff --git a/velox/dwio/common/tests/Lemire/FastPFor/LICENSE b/velox/dwio/common/tests/Lemire/FastPFor/LICENSE index 8405e89a0b12..37ec93a14fdc 100644 --- a/velox/dwio/common/tests/Lemire/FastPFor/LICENSE +++ b/velox/dwio/common/tests/Lemire/FastPFor/LICENSE @@ -188,4 +188,4 @@ third-party archives. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp index 446aaceee766..f7656fbd08ca 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp @@ -681,8 +681,18 @@ void E2EFilterTestBase::testSubfieldsPruning() { [](auto) { return 1; }, [](auto) { return 0; }, [](auto) { return "foofoofoofoofoo"_sv; }); - batches.push_back( - vectorMaker.rowVector({"a", "b", "c", "d"}, {a, b, c, d})); + auto e = vectorMaker.mapVector( + batchSize_, + [&](auto) { return kMapSize; }, + [](auto j) { return j; }, + [&](auto j) { return j % kMapSize; }); + auto f = vectorMaker.arrayVector( + batchSize_, + [&](auto j) { return kMapSize; }, + [&](auto j) { return j % kMapSize; }, + [&](auto j) { return j >= i + 1 && j % 23 == (i + 1) % 23; }); + batches.push_back(vectorMaker.rowVector( + {"a", "b", "c", "d", "e", "f"}, {a, b, c, d, e, f})); } writeToMemory(batches[0]->type(), batches, false); auto spec = std::make_shared(""); @@ -707,6 +717,12 @@ void E2EFilterTestBase::testSubfieldsPruning() { auto specD = spec->addFieldRecursively("d", *MAP(BIGINT(), VARCHAR()), 3); specD->childByName(common::ScanSpec::kMapKeysFieldName) ->setFilter(common::createBigintValues({1}, false)); + auto specE = spec->addFieldRecursively("e", *MAP(BIGINT(), BIGINT()), 4); + specE->childByName(common::ScanSpec::kMapValuesFieldName) + ->setFilter(common::createBigintValues({0, 2, 4}, false)); + auto specF = spec->addFieldRecursively("f", *ARRAY(BIGINT()), 5); + specF->childByName(common::ScanSpec::kArrayElementsFieldName) + ->setFilter(common::createBigintValues({0, 2, 4}, false)); ReaderOptions readerOpts{leafPool_.get()}; RowReaderOptions rowReaderOpts; auto input = std::make_unique( @@ -756,6 +772,31 @@ void E2EFilterTestBase::testSubfieldsPruning() { auto* dd = actual->childAt(3)->loadedVector()->asUnchecked(); ASSERT_FALSE(dd->isNullAt(ii)); 
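// Map column "d" stores a single entry with key 0 per row, while the scan spec keeps only key 1, so every entry is pruned and the map reads back as empty rather than null.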
ASSERT_EQ(dd->sizeAt(ii), 0); + auto* e = expected->childAt(4)->asUnchecked(); + auto* ee = actual->childAt(4)->loadedVector()->asUnchecked(); + ASSERT_FALSE(ee->isNullAt(ii)); + ASSERT_EQ(ee->sizeAt(ii), (kMapSize + 1) / 2); + for (int k = 0; k < kMapSize; k += 2) { + int k1 = ee->offsetAt(ii) + k / 2; + int k2 = e->offsetAt(j) + k; + ASSERT_TRUE(ee->mapKeys()->equalValueAt(e->mapKeys().get(), k1, k2)); + ASSERT_TRUE( + ee->mapValues()->equalValueAt(e->mapValues().get(), k1, k2)); + } + auto* f = expected->childAt(5)->asUnchecked(); + auto* ff = actual->childAt(5)->loadedVector()->asUnchecked(); + if (f->isNullAt(j)) { + ASSERT_TRUE(ff->isNullAt(ii)); + } else { + ASSERT_FALSE(ff->isNullAt(ii)); + for (int k = 0; k < kMapSize; k += 2) { + int k1 = ff->offsetAt(ii) + k / 2; + int k2 = f->offsetAt(j) + k; + + ASSERT_TRUE( + ff->elements()->equalValueAt(f->elements().get(), k1, k2)); + } + } ++ii; } } diff --git a/velox/dwio/dwrf/test/CommonTests.cpp b/velox/dwio/dwrf/test/CommonTests.cpp index e1c94cae1ddf..03b69182c498 100644 --- a/velox/dwio/dwrf/test/CommonTests.cpp +++ b/velox/dwio/dwrf/test/CommonTests.cpp @@ -95,7 +95,7 @@ TEST_F( {proto::Stream_Kind_DICTIONARY_COUNT, StreamKind::StreamKind_DICTIONARY_COUNT}, {proto::Stream_Kind_NANO_DATA, StreamKind::StreamKind_NANO_DATA}, {proto::Stream_Kind_ROW_INDEX, StreamKind::StreamKind_ROW_INDEX}, - {proto::Stream_Kind_IN_DICTIONARY, StreamKind::StreamKind_IN_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY, StreamKind::StreamKind_STRIDE_DICTIONARY}, + {proto::Stream_Kind_IN_DICTIONARY, StreamKind::StreamKind_IN_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY, StreamKind::StreamKind_STRIDE_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY_LENGTH, StreamKind::StreamKind_STRIDE_DICTIONARY_LENGTH}, {proto::Stream_Kind_BLOOM_FILTER_UTF8, StreamKind::StreamKind_BLOOM_FILTER_UTF8}, {proto::Stream_Kind_IN_MAP, StreamKind::StreamKind_IN_MAP}, @@ -145,7 +145,7 @@ TEST_F( {proto::Stream_Kind_DICTIONARY_COUNT, StreamKind::StreamKind_DICTIONARY_COUNT}, {proto::Stream_Kind_NANO_DATA, StreamKind::StreamKind_NANO_DATA}, {proto::Stream_Kind_ROW_INDEX, StreamKind::StreamKind_ROW_INDEX}, - {proto::Stream_Kind_IN_DICTIONARY, StreamKind::StreamKind_IN_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY, StreamKind::StreamKind_STRIDE_DICTIONARY}, + {proto::Stream_Kind_IN_DICTIONARY, StreamKind::StreamKind_IN_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY, StreamKind::StreamKind_STRIDE_DICTIONARY}, {proto::Stream_Kind_STRIDE_DICTIONARY_LENGTH, StreamKind::StreamKind_STRIDE_DICTIONARY_LENGTH}, {proto::Stream_Kind_BLOOM_FILTER_UTF8, StreamKind::StreamKind_BLOOM_FILTER_UTF8}, {proto::Stream_Kind_IN_MAP, StreamKind::StreamKind_IN_MAP}, diff --git a/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp b/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp index 599ee76ace5e..674c99300f76 100644 --- a/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp +++ b/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp @@ -849,21 +849,16 @@ void Statistics::printTo(std::ostream& out) const { out << "Statistics("; out << "max="; (__isset.max ? (out << to_string(max)) : (out << "")); - out << ", " - << "min="; + out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "")); - out << ", " - << "null_count="; + out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "")); - out << ", " - << "distinct_count="; + out << ", " << "distinct_count="; (__isset.distinct_count ? 
(out << to_string(distinct_count)) : (out << "")); - out << ", " - << "max_value="; + out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "")); - out << ", " - << "min_value="; + out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "")); out << ")"; } @@ -1365,8 +1360,7 @@ void DecimalType::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "DecimalType("; out << "scale=" << to_string(scale); - out << ", " - << "precision=" << to_string(precision); + out << ", " << "precision=" << to_string(precision); out << ")"; } @@ -1669,11 +1663,9 @@ void TimeUnit::printTo(std::ostream& out) const { out << "TimeUnit("; out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "")); - out << ", " - << "MICROS="; + out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "")); - out << ", " - << "NANOS="; + out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "")); out << ")"; } @@ -1784,8 +1776,7 @@ void TimestampType::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "TimestampType("; out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC); - out << ", " - << "unit=" << to_string(unit); + out << ", " << "unit=" << to_string(unit); out << ")"; } @@ -1894,8 +1885,7 @@ void TimeType::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "TimeType("; out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC); - out << ", " - << "unit=" << to_string(unit); + out << ", " << "unit=" << to_string(unit); out << ")"; } @@ -2004,8 +1994,7 @@ void IntType::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "IntType("; out << "bitWidth=" << to_string(bitWidth); - out << ", " - << "isSigned=" << to_string(isSigned); + out << ", " << "isSigned=" << to_string(isSigned); out << ")"; } @@ -2470,41 +2459,29 @@ void LogicalType::printTo(std::ostream& out) const { out << "LogicalType("; out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "")); - out << ", " - << "MAP="; + out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "")); - out << ", " - << "LIST="; + out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "")); - out << ", " - << "ENUM="; + out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "")); - out << ", " - << "DECIMAL="; + out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "")); - out << ", " - << "DATE="; + out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "")); - out << ", " - << "TIME="; + out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "")); - out << ", " - << "TIMESTAMP="; + out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "")); - out << ", " - << "INTEGER="; + out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "")); - out << ", " - << "UNKNOWN="; + out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "")); - out << ", " - << "JSON="; + out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "")); - out << ", " - << "BSON="; + out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "")); - out << ", " - << "UUID="; + out << ", " << "UUID="; (__isset.UUID ? 
(out << to_string(UUID)) : (out << "")); out << ")"; } @@ -2801,33 +2778,24 @@ void SchemaElement::printTo(std::ostream& out) const { out << "SchemaElement("; out << "type="; (__isset.type ? (out << to_string(type)) : (out << "")); - out << ", " - << "type_length="; + out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "")); - out << ", " - << "repetition_type="; + out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "")); - out << ", " - << "name=" << to_string(name); - out << ", " - << "num_children="; + out << ", " << "name=" << to_string(name); + out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "")); - out << ", " - << "converted_type="; + out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "")); - out << ", " - << "scale="; + out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "")); - out << ", " - << "precision="; + out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "")); - out << ", " - << "field_id="; + out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "")); - out << ", " - << "logicalType="; + out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "")); out << ")"; } @@ -3019,14 +2987,12 @@ void DataPageHeader::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "DataPageHeader("; out << "num_values=" << to_string(num_values); - out << ", " - << "encoding=" << to_string(encoding); + out << ", " << "encoding=" << to_string(encoding); out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding); out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding); - out << ", " - << "statistics="; + out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "")); out << ")"; } @@ -3225,10 +3191,8 @@ void DictionaryPageHeader::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "DictionaryPageHeader("; out << "num_values=" << to_string(num_values); - out << ", " - << "encoding=" << to_string(encoding); - out << ", " - << "is_sorted="; + out << ", " << "encoding=" << to_string(encoding); + out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "")); out << ")"; } @@ -3480,24 +3444,17 @@ void DataPageHeaderV2::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "DataPageHeaderV2("; out << "num_values=" << to_string(num_values); - out << ", " - << "num_nulls=" << to_string(num_nulls); - out << ", " - << "num_rows=" << to_string(num_rows); - out << ", " - << "encoding=" << to_string(encoding); - out << ", " - << "definition_levels_byte_length=" + out << ", " << "num_nulls=" << to_string(num_nulls); + out << ", " << "num_rows=" << to_string(num_rows); + out << ", " << "encoding=" << to_string(encoding); + out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length); - out << ", " - << "repetition_levels_byte_length=" + out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length); - out << ", " - << "is_compressed="; + out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "")); - out << ", " - << "statistics="; + out << ", " << "statistics="; (__isset.statistics ? 
(out << to_string(statistics)) : (out << "")); out << ")"; } @@ -4098,12 +4055,9 @@ void BloomFilterHeader::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "BloomFilterHeader("; out << "numBytes=" << to_string(numBytes); - out << ", " - << "algorithm=" << to_string(algorithm); - out << ", " - << "hash=" << to_string(hash); - out << ", " - << "compression=" << to_string(compression); + out << ", " << "algorithm=" << to_string(algorithm); + out << ", " << "hash=" << to_string(hash); + out << ", " << "compression=" << to_string(compression); out << ")"; } @@ -4348,27 +4302,20 @@ void PageHeader::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "PageHeader("; out << "type=" << to_string(type); - out << ", " - << "uncompressed_page_size=" << to_string(uncompressed_page_size); - out << ", " - << "compressed_page_size=" << to_string(compressed_page_size); - out << ", " - << "crc="; + out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size); + out << ", " << "compressed_page_size=" << to_string(compressed_page_size); + out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "")); - out << ", " - << "data_page_header="; + out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "")); - out << ", " - << "index_page_header="; + out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "")); - out << ", " - << "dictionary_page_header="; + out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "")); - out << ", " - << "data_page_header_v2="; + out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "")); out << ")"; @@ -4481,8 +4428,7 @@ void KeyValue::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "KeyValue("; out << "key=" << to_string(key); - out << ", " - << "value="; + out << ", " << "value="; (__isset.value ? 
(out << to_string(value)) : (out << "")); out << ")"; } @@ -4616,10 +4562,8 @@ void SortingColumn::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "SortingColumn("; out << "column_idx=" << to_string(column_idx); - out << ", " - << "descending=" << to_string(descending); - out << ", " - << "nulls_first=" << to_string(nulls_first); + out << ", " << "descending=" << to_string(descending); + out << ", " << "nulls_first=" << to_string(nulls_first); out << ")"; } @@ -4756,10 +4700,8 @@ void PageEncodingStats::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "PageEncodingStats("; out << "page_type=" << to_string(page_type); - out << ", " - << "encoding=" << to_string(encoding); - out << ", " - << "count=" << to_string(count); + out << ", " << "encoding=" << to_string(encoding); + out << ", " << "count=" << to_string(count); out << ")"; } @@ -5235,41 +5177,29 @@ void ColumnMetaData::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "ColumnMetaData("; out << "type=" << to_string(type); - out << ", " - << "encodings=" << to_string(encodings); - out << ", " - << "path_in_schema=" << to_string(path_in_schema); - out << ", " - << "codec=" << to_string(codec); - out << ", " - << "num_values=" << to_string(num_values); + out << ", " << "encodings=" << to_string(encodings); + out << ", " << "path_in_schema=" << to_string(path_in_schema); + out << ", " << "codec=" << to_string(codec); + out << ", " << "num_values=" << to_string(num_values); out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size); - out << ", " - << "total_compressed_size=" << to_string(total_compressed_size); - out << ", " - << "key_value_metadata="; + out << ", " << "total_compressed_size=" << to_string(total_compressed_size); + out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "")); - out << ", " - << "data_page_offset=" << to_string(data_page_offset); - out << ", " - << "index_page_offset="; + out << ", " << "data_page_offset=" << to_string(data_page_offset); + out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "")); - out << ", " - << "dictionary_page_offset="; + out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "")); - out << ", " - << "statistics="; + out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "")); - out << ", " - << "encoding_stats="; + out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "")); - out << ", " - << "bloom_filter_offset="; + out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "")); out << ")"; @@ -5473,8 +5403,7 @@ void EncryptionWithColumnKey::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "EncryptionWithColumnKey("; out << "path_in_schema=" << to_string(path_in_schema); - out << ", " - << "key_metadata="; + out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "")); out << ")"; } @@ -5593,8 +5522,7 @@ void ColumnCryptoMetaData::printTo(std::ostream& out) const { (__isset.ENCRYPTION_WITH_FOOTER_KEY ? 
(out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "")); - out << ", " - << "ENCRYPTION_WITH_COLUMN_KEY="; + out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "")); @@ -5864,33 +5792,25 @@ void ColumnChunk::printTo(std::ostream& out) const { out << "ColumnChunk("; out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "")); - out << ", " - << "file_offset=" << to_string(file_offset); - out << ", " - << "meta_data="; + out << ", " << "file_offset=" << to_string(file_offset); + out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "")); - out << ", " - << "offset_index_offset="; + out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "")); - out << ", " - << "offset_index_length="; + out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "")); - out << ", " - << "column_index_offset="; + out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "")); - out << ", " - << "column_index_length="; + out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "")); - out << ", " - << "crypto_metadata="; + out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "")); - out << ", " - << "encrypted_column_metadata="; + out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "")); @@ -6159,23 +6079,17 @@ void RowGroup::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "RowGroup("; out << "columns=" << to_string(columns); - out << ", " - << "total_byte_size=" << to_string(total_byte_size); - out << ", " - << "num_rows=" << to_string(num_rows); - out << ", " - << "sorting_columns="; + out << ", " << "total_byte_size=" << to_string(total_byte_size); + out << ", " << "num_rows=" << to_string(num_rows); + out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "")); - out << ", " - << "file_offset="; + out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "")); - out << ", " - << "total_compressed_size="; + out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "")); - out << ", " - << "ordinal="; + out << ", " << "ordinal="; (__isset.ordinal ? 
(out << to_string(ordinal)) : (out << "")); out << ")"; } @@ -6455,10 +6369,8 @@ void PageLocation::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "PageLocation("; out << "offset=" << to_string(offset); - out << ", " - << "compressed_page_size=" << to_string(compressed_page_size); - out << ", " - << "first_row_index=" << to_string(first_row_index); + out << ", " << "compressed_page_size=" << to_string(compressed_page_size); + out << ", " << "first_row_index=" << to_string(first_row_index); out << ")"; } @@ -6837,14 +6749,10 @@ void ColumnIndex::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "ColumnIndex("; out << "null_pages=" << to_string(null_pages); - out << ", " - << "min_values=" << to_string(min_values); - out << ", " - << "max_values=" << to_string(max_values); - out << ", " - << "boundary_order=" << to_string(boundary_order); - out << ", " - << "null_counts="; + out << ", " << "min_values=" << to_string(min_values); + out << ", " << "max_values=" << to_string(max_values); + out << ", " << "boundary_order=" << to_string(boundary_order); + out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "")); out << ")"; } @@ -6977,12 +6885,10 @@ void AesGcmV1::printTo(std::ostream& out) const { out << "AesGcmV1("; out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "")); - out << ", " - << "aad_file_unique="; + out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "")); - out << ", " - << "supply_aad_prefix="; + out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "")); out << ")"; @@ -7117,12 +7023,10 @@ void AesGcmCtrV1::printTo(std::ostream& out) const { out << "AesGcmCtrV1("; out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "")); - out << ", " - << "aad_file_unique="; + out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "")); - out << ", " - << "supply_aad_prefix="; + out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "")); out << ")"; @@ -7237,8 +7141,7 @@ void EncryptionAlgorithm::printTo(std::ostream& out) const { out << "EncryptionAlgorithm("; out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "")); - out << ", " - << "AES_GCM_CTR_V1="; + out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "")); out << ")"; @@ -7596,29 +7499,21 @@ void FileMetaData::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "FileMetaData("; out << "version=" << to_string(version); - out << ", " - << "schema=" << to_string(schema); - out << ", " - << "num_rows=" << to_string(num_rows); - out << ", " - << "row_groups=" << to_string(row_groups); - out << ", " - << "key_value_metadata="; + out << ", " << "schema=" << to_string(schema); + out << ", " << "num_rows=" << to_string(num_rows); + out << ", " << "row_groups=" << to_string(row_groups); + out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "")); - out << ", " - << "created_by="; + out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "")); - out << ", " - << "column_orders="; + out << ", " << "column_orders="; (__isset.column_orders ? 
(out << to_string(column_orders)) : (out << "")); - out << ", " - << "encryption_algorithm="; + out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "")); - out << ", " - << "footer_signing_key_metadata="; + out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "")); @@ -7736,8 +7631,7 @@ void FileCryptoMetaData::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "FileCryptoMetaData("; out << "encryption_algorithm=" << to_string(encryption_algorithm); - out << ", " - << "key_metadata="; + out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "")); out << ")"; } diff --git a/velox/exec/FilterProject.cpp b/velox/exec/FilterProject.cpp index 4b095f3fb55c..5aaf4f7e0d93 100644 --- a/velox/exec/FilterProject.cpp +++ b/velox/exec/FilterProject.cpp @@ -89,7 +89,8 @@ FilterProject::FilterProject( project_(project), filter_(filter) { if (filter_ != nullptr && project_ != nullptr) { - stats().withWLock([&](auto& stats) { + folly::Synchronized& opStats = Operator::stats(); + opStats.withWLock([&](auto& stats) { stats.setStatSplitter( [filterId = filter_->id()](const auto& combinedStats) { return splitStats(combinedStats, filterId); @@ -229,4 +230,16 @@ vector_size_t FilterProject::filter( exprs_->eval(0, 1, true, allRows, evalCtx, results); return processFilterResults(results[0], allRows, filterEvalCtx_, pool()); } + +OperatorStats FilterProject::stats(bool clear) { + auto stats = Operator::stats(clear); + if (operatorCtx() + ->driverCtx() + ->queryConfig() + .operatorTrackExpressionStats() && + exprs_ != nullptr) { + stats.expressionStats = exprs_->stats(); + } + return stats; +} } // namespace facebook::velox::exec diff --git a/velox/exec/FilterProject.h b/velox/exec/FilterProject.h index d82f2cb2a400..ebab858cd120 100644 --- a/velox/exec/FilterProject.h +++ b/velox/exec/FilterProject.h @@ -78,6 +78,10 @@ class FilterProject : public Operator { void initialize() override; + /// Ensures that expression stats are added to the operator stats if their + /// tracking is enabled via query config. + OperatorStats stats(bool clear) override; + private: // Tests if 'numProcessedRows_' equals to the length of input_ and clears // outstanding references to input_ if done. Returns true if getOutput diff --git a/velox/exec/IndexLookupJoin.h b/velox/exec/IndexLookupJoin.h index 30860f032e89..67f8d843f929 100644 --- a/velox/exec/IndexLookupJoin.h +++ b/velox/exec/IndexLookupJoin.h @@ -77,6 +77,9 @@ class IndexLookupJoin : public Operator { /// the raw data received from the remote storage lookup. static inline const std::string kClientLookupResultSize{ "clientLookupResultSize"}; + /// The number of lazy decoded result batches. 
+ static inline const std::string kClientNumLazyDecodedResultBatches{ + "clientNumLazyDecodedResultBatches"}; private: using LookupResultIter = connector::IndexSource::LookupResultIterator; diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index 44f618ae386a..668dab4d5fcc 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -592,6 +592,14 @@ void OperatorStats::add(const OperatorStats& other) { } } + for (const auto& [name, exprStats] : other.expressionStats) { + if (UNLIKELY(expressionStats.count(name) == 0)) { + expressionStats.insert(std::make_pair(name, exprStats)); + } else { + expressionStats.at(name).add(exprStats); + } + } + numDrivers += other.numDrivers; spilledInputBytes += other.spilledInputBytes; spilledBytes += other.spilledBytes; @@ -628,6 +636,7 @@ void OperatorStats::clear() { memoryStats.clear(); runtimeStats.clear(); + expressionStats.clear(); numDrivers = 0; spilledInputBytes = 0; diff --git a/velox/exec/OperatorStats.h b/velox/exec/OperatorStats.h index f833b1b10403..ce1f72af9c9d 100644 --- a/velox/exec/OperatorStats.h +++ b/velox/exec/OperatorStats.h @@ -17,6 +17,7 @@ #include "velox/common/memory/MemoryPool.h" #include "velox/common/time/CpuWallTimer.h" +#include "velox/expression/ExprStats.h" namespace facebook::velox::exec { @@ -181,6 +182,11 @@ struct OperatorStats { std::unordered_map<std::string, RuntimeMetric> runtimeStats; + // A map of expression name to its respective stats. These are only + // populated when a copy of the stats is returned via the + // Operator::stats(bool) API. + std::unordered_map<std::string, ExprStats> expressionStats; + int numDrivers = 0; OperatorStats() = default; diff --git a/velox/exec/PlanNodeStats.cpp b/velox/exec/PlanNodeStats.cpp index 5306a642b5a0..320bf8f16bd2 100644 --- a/velox/exec/PlanNodeStats.cpp +++ b/velox/exec/PlanNodeStats.cpp @@ -131,6 +131,14 @@ void PlanNodeStats::addTotals(const OperatorStats& stats) { } } + for (const auto& [name, exprStats] : stats.expressionStats) { + if (UNLIKELY(this->expressionStats.count(name) == 0)) { + this->expressionStats.insert(std::make_pair(name, exprStats)); + } else { + this->expressionStats.at(name).add(exprStats); + } + } + // Populating number of drivers for plan nodes with multiple operators is not // useful. Each operator could have been executed in different pipelines with // different number of drivers. diff --git a/velox/exec/PlanNodeStats.h b/velox/exec/PlanNodeStats.h index 8c41d7a0a46d..c1c6dbc00ee9 100644 --- a/velox/exec/PlanNodeStats.h +++ b/velox/exec/PlanNodeStats.h @@ -18,6 +18,7 @@ #include #include "velox/common/time/CpuWallTimer.h" #include "velox/exec/Operator.h" +#include "velox/expression/ExprStats.h" namespace facebook::velox::exec { struct TaskStats; @@ -142,6 +143,9 @@ struct PlanNodeStats { /// Total spilled files. uint32_t spilledFiles{0}; + /// A map of expression name to its respective stats. + std::unordered_map<std::string, ExprStats> expressionStats; + /// Add stats for a single operator instance.
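Both hunks above merge per-expression stats with the same insert-or-accumulate pattern. A self-contained sketch of that pattern, using a toy single-counter stats type (illustration only, not Velox code):

    #include <cstdint>
    #include <string>
    #include <unordered_map>

    struct Stats {
      uint64_t numProcessedRows{0};
      void add(const Stats& other) {
        numProcessedRows += other.numProcessedRows;
      }
    };

    void merge(
        std::unordered_map<std::string, Stats>& target,
        const std::unordered_map<std::string, Stats>& source) {
      for (const auto& [name, stats] : source) {
        auto [it, inserted] = target.try_emplace(name, stats);
        if (!inserted) {
          it->second.add(stats); // Expression already seen: accumulate.
        }
      }
    }

    int main() {
      std::unordered_map<std::string, Stats> total{{"plus", {10}}};
      merge(total, {{"plus", {10}}, {"lt", {5}}});
      // Now total["plus"].numProcessedRows == 20 and total["lt"].numProcessedRows == 5.
      return 0;
    }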
void add(const OperatorStats& stats); diff --git a/velox/exec/TraceUtil.cpp b/velox/exec/TraceUtil.cpp index 59734f032788..f63e97b303a0 100644 --- a/velox/exec/TraceUtil.cpp +++ b/velox/exec/TraceUtil.cpp @@ -421,6 +421,18 @@ core::PlanNodePtr getTraceNode( tableWriteNode->sources().front()->outputType())); } + if (const auto* unnestNode = + dynamic_cast(traceNode)) { + return std::make_shared( + nodeId, + unnestNode->replicateVariables(), + unnestNode->unnestVariables(), + unnestNode->unnestNames(), + unnestNode->ordinalityName(), + std::make_shared( + unnestNode->sources().front()->outputType())); + } + VELOX_UNSUPPORTED( fmt::format("Unsupported trace node: {}", traceNode->name())); } diff --git a/velox/exec/Unnest.cpp b/velox/exec/Unnest.cpp index 719ade16bfba..691ed69c7c2f 100644 --- a/velox/exec/Unnest.cpp +++ b/velox/exec/Unnest.cpp @@ -42,7 +42,10 @@ Unnest::Unnest( unnestNode->id(), "Unnest"), withOrdinality_(unnestNode->withOrdinality()), - maxOutputSize_(outputBatchRows()) { + maxOutputSize_( + driverCtx->queryConfig().unnestSplitOutput() + ? outputBatchRows() + : std::numeric_limits::max()) { const auto& inputType = unnestNode->sources()[0]->outputType(); const auto& unnestVariables = unnestNode->unnestVariables(); for (const auto& variable : unnestVariables) { diff --git a/velox/exec/Unnest.h b/velox/exec/Unnest.h index 883e4c75fa65..bd9b27e86b58 100644 --- a/velox/exec/Unnest.h +++ b/velox/exec/Unnest.h @@ -136,6 +136,7 @@ class Unnest : public Operator { // The maximum number of output batch rows. const uint32_t maxOutputSize_; + BufferPtr maxSizes_; vector_size_t* rawMaxSizes_{nullptr}; diff --git a/velox/exec/fuzzer/CMakeLists.txt b/velox/exec/fuzzer/CMakeLists.txt index 50b945562b7a..18443f26819a 100644 --- a/velox/exec/fuzzer/CMakeLists.txt +++ b/velox/exec/fuzzer/CMakeLists.txt @@ -142,7 +142,9 @@ target_link_libraries( velox_dwio_faulty_file_sink velox_file_test_utils) -add_library(velox_memory_arbitration_fuzzer MemoryArbitrationFuzzer.cpp) +# Arbitration Fuzzer. +add_executable(velox_memory_arbitration_fuzzer MemoryArbitrationFuzzerRunner.cpp + MemoryArbitrationFuzzer.cpp) target_link_libraries( velox_memory_arbitration_fuzzer diff --git a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp index 4aaeee65b887..c7b7eb006d73 100644 --- a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp +++ b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp @@ -98,7 +98,7 @@ DEFINE_int32( using namespace facebook::velox::tests::utils; -namespace facebook::velox::exec::test { +namespace facebook::velox::exec { namespace { using fuzzer::coinToss; @@ -147,7 +147,7 @@ class MemoryArbitrationFuzzer { return boost::random::uniform_int_distribution(min, max)(rng_); } - std::shared_ptr maybeGenerateFaultySpillDirectory(); + std::shared_ptr maybeGenerateFaultySpillDirectory(); // Returns a list of randomly generated key types for join and aggregation. std::vector generateKeyTypes(int32_t numKeys); @@ -274,7 +274,7 @@ MemoryArbitrationFuzzer::MemoryArbitrationFuzzer(size_t initialSeed) connector::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( - kHiveConnectorId, + test::kHiveConnectorId, std::make_shared(std::move(hiveConfig))); connector::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); @@ -445,7 +445,7 @@ MemoryArbitrationFuzzer::hashJoinPlans( (core::isLeftSemiProjectJoin(joinType) || core::isLeftSemiFilterJoin(joinType) || core::isAntiJoin(joinType)) ? 
asRowType(probeInput[0]->type())->names() - : concat( + : test::concat( asRowType(probeInput[0]->type()), asRowType(buildInput[0]->type())) ->names(); @@ -456,22 +456,23 @@ MemoryArbitrationFuzzer::hashJoinPlans( std::vector plans; auto planNodeIdGenerator = std::make_shared(); - auto plan = - PlanBuilder(planNodeIdGenerator) - .values(probeInput) - .hashJoin( - probeKeys, - buildKeys, - PlanBuilder(planNodeIdGenerator).values(buildInput).planNode(), - /*filter=*/"", - outputColumns, - joinType, - false) - .planNode(); + auto plan = test::PlanBuilder(planNodeIdGenerator) + .values(probeInput) + .hashJoin( + probeKeys, + buildKeys, + test::PlanBuilder(planNodeIdGenerator) + .values(buildInput) + .planNode(), + /*filter=*/"", + outputColumns, + joinType, + false) + .planNode(); plans.push_back(PlanWithSplits{std::move(plan), {}}); - if (!isTableScanSupported(probeInput[0]->type()) || - !isTableScanSupported(buildInput[0]->type())) { + if (!test::isTableScanSupported(probeInput[0]->type()) || + !test::isTableScanSupported(buildInput[0]->type())) { return plans; } @@ -480,13 +481,13 @@ MemoryArbitrationFuzzer::hashJoinPlans( const auto buildType = asRowType(buildInput[0]->type()); core::PlanNodeId probeScanId; core::PlanNodeId buildScanId; - plan = PlanBuilder(planNodeIdGenerator) + plan = test::PlanBuilder(planNodeIdGenerator) .tableScan(probeType) .capturePlanNodeId(probeScanId) .hashJoin( probeKeys, buildKeys, - PlanBuilder(planNodeIdGenerator) + test::PlanBuilder(planNodeIdGenerator) .tableScan(buildType) .capturePlanNodeId(buildScanId) .planNode(), @@ -513,14 +514,14 @@ MemoryArbitrationFuzzer::hashJoinPlans(const std::string& tableDir) { const auto numKeys = randInt(1, 5); const std::vector keyTypes = generateKeyTypes(numKeys); - std::vector probeKeys = makeNames("t", keyTypes.size()); - std::vector buildKeys = makeNames("u", keyTypes.size()); + std::vector probeKeys = test::makeNames("t", keyTypes.size()); + std::vector buildKeys = test::makeNames("u", keyTypes.size()); const auto probeInput = generateProbeInput(probeKeys, keyTypes); const auto buildInput = generateBuildInput(probeInput, probeKeys, buildKeys); - const std::vector probeScanSplits = - makeSplits(probeInput, fmt::format("{}/probe", tableDir), writerPool_); - const std::vector buildScanSplits = - makeSplits(buildInput, fmt::format("{}/build", tableDir), writerPool_); + const std::vector probeScanSplits = test::makeSplits( + probeInput, fmt::format("{}/probe", tableDir), writerPool_); + const std::vector buildScanSplits = test::makeSplits( + buildInput, fmt::format("{}/build", tableDir), writerPool_); std::vector totalPlans; for (const auto& joinType : kJoinTypes) { @@ -545,10 +546,11 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { const auto numKeys = randInt(1, 5); // Reuse the hash join utilities to generate aggregation keys and inputs. 
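Here generateKeyTypes draws a random number of randomly chosen scalar key types, and makeNames(prefix, n) pairs them with generated names prefix0 through prefix{n-1}. A standalone sketch of that pattern (std::mt19937 stands in for the fuzzer's boost-based RNG, and the type-name list is hypothetical; illustration only):

    #include <iostream>
    #include <random>
    #include <string>
    #include <vector>

    int main() {
      std::mt19937 rng(42); // Fixed seed, like the fuzzer's --seed flag.
      const std::vector<std::string> kScalarTypes{
          "BIGINT", "INTEGER", "VARCHAR", "DOUBLE"};
      // randInt(1, 5): between one and five keys.
      const int numKeys = std::uniform_int_distribution<int>(1, 5)(rng);
      std::uniform_int_distribution<size_t> pickType(0, kScalarTypes.size() - 1);
      for (int i = 0; i < numKeys; ++i) {
        // makeNames("g", numKeys) yields g0, g1, ..., g{numKeys - 1}.
        std::cout << "g" << i << ": " << kScalarTypes[pickType(rng)] << "\n";
      }
      return 0;
    }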
const std::vector keyTypes = generateKeyTypes(numKeys); - const std::vector groupingKeys = makeNames("g", keyTypes.size()); + const std::vector groupingKeys = + test::makeNames("g", keyTypes.size()); const auto aggregateInput = generateAggregateInput(groupingKeys, keyTypes); const std::vector aggregates{"count(1)"}; - const std::vector splits = makeSplits( + const std::vector splits = test::makeSplits( aggregateInput, fmt::format("{}/aggregate", tableDir), writerPool_); std::vector plans; @@ -559,7 +561,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { std::make_shared(); core::PlanNodeId scanId; auto plan = PlanWithSplits{ - PlanBuilder(planNodeIdGenerator) + test::PlanBuilder(planNodeIdGenerator) .tableScan(inputRowType) .capturePlanNodeId(scanId) .singleAggregation(groupingKeys, aggregates, {}) @@ -568,7 +570,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { plans.push_back(std::move(plan)); plan = PlanWithSplits{ - PlanBuilder() + test::PlanBuilder() .values(aggregateInput) .singleAggregation(groupingKeys, aggregates, {}) .planNode(), @@ -582,7 +584,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { std::make_shared(); core::PlanNodeId scanId; auto plan = PlanWithSplits{ - PlanBuilder(planNodeIdGenerator) + test::PlanBuilder(planNodeIdGenerator) .tableScan(inputRowType) .capturePlanNodeId(scanId) .partialAggregation(groupingKeys, aggregates, {}) @@ -592,7 +594,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { plans.push_back(std::move(plan)); plan = PlanWithSplits{ - PlanBuilder() + test::PlanBuilder() .values(aggregateInput) .partialAggregation(groupingKeys, aggregates, {}) .finalAggregation() @@ -607,7 +609,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { std::make_shared(); core::PlanNodeId scanId; auto plan = PlanWithSplits{ - PlanBuilder(planNodeIdGenerator) + test::PlanBuilder(planNodeIdGenerator) .tableScan(inputRowType) .capturePlanNodeId(scanId) .partialAggregation(groupingKeys, aggregates, {}) @@ -618,7 +620,7 @@ MemoryArbitrationFuzzer::aggregatePlans(const std::string& tableDir) { plans.push_back(std::move(plan)); plan = PlanWithSplits{ - PlanBuilder() + test::PlanBuilder() .values(aggregateInput) .partialAggregation(groupingKeys, aggregates, {}) .intermediateAggregation() @@ -641,7 +643,7 @@ MemoryArbitrationFuzzer::rowNumberPlans(const std::string& tableDir) { std::vector projectFields = keyNames; projectFields.emplace_back("row_number"); auto plan = PlanWithSplits{ - PlanBuilder() + test::PlanBuilder() .values(input) .rowNumber(keyNames) .project(projectFields) @@ -649,17 +651,17 @@ MemoryArbitrationFuzzer::rowNumberPlans(const std::string& tableDir) { {}}; plans.push_back(std::move(plan)); - if (!isTableScanSupported(input[0]->type())) { + if (!test::isTableScanSupported(input[0]->type())) { return plans; } - const std::vector splits = - makeSplits(input, fmt::format("{}/row_number", tableDir), writerPool_); + const std::vector splits = test::makeSplits( + input, fmt::format("{}/row_number", tableDir), writerPool_); auto planNodeIdGenerator = std::make_shared(); core::PlanNodeId scanId; plan = PlanWithSplits{ - PlanBuilder(planNodeIdGenerator) + test::PlanBuilder(planNodeIdGenerator) .tableScan(asRowType(input[0]->type())) .capturePlanNodeId(scanId) .rowNumber(keyNames) @@ -679,20 +681,21 @@ MemoryArbitrationFuzzer::orderByPlans(const std::string& tableDir) { std::vector plans; auto plan = PlanWithSplits{ - 
PlanBuilder().values(input).orderBy(keyNames, false).planNode(), {}}; + test::PlanBuilder().values(input).orderBy(keyNames, false).planNode(), + {}}; plans.push_back(std::move(plan)); - if (!isTableScanSupported(input[0]->type())) { + if (!test::isTableScanSupported(input[0]->type())) { return plans; } - const std::vector splits = - makeSplits(input, fmt::format("{}/order_by", tableDir), writerPool_); + const std::vector splits = test::makeSplits( + input, fmt::format("{}/order_by", tableDir), writerPool_); auto planNodeIdGenerator = std::make_shared(); core::PlanNodeId scanId; plan = PlanWithSplits{ - PlanBuilder(std::move(planNodeIdGenerator)) + test::PlanBuilder(std::move(planNodeIdGenerator)) .tableScan(asRowType(input[0]->type())) .capturePlanNodeId(scanId) .orderBy(keyNames, false) @@ -728,7 +731,7 @@ struct ThreadLocalStats { // Stats that keeps track of per thread execution status in verify() thread_local ThreadLocalStats threadLocalStats; -std::shared_ptr +std::shared_ptr MemoryArbitrationFuzzer::maybeGenerateFaultySpillDirectory() { FuzzerGenerator fsRng(rng_()); const auto injectFsFault = @@ -772,7 +775,7 @@ void MemoryArbitrationFuzzer::verify() { auto plans = allPlans(tableScanDir->getPath()); SCOPE_EXIT { - waitForAllTasksToBeDeleted(); + test::waitForAllTasksToBeDeleted(); if (auto faultyFileSystem = std::dynamic_pointer_cast( filesystems::getFileSystem(spillDirectory->getPath(), nullptr))) { faultyFileSystem->clearFileFaultInjections(); @@ -796,14 +799,14 @@ void MemoryArbitrationFuzzer::verify() { const auto queryId = fmt::format("query_id_{}", queryCount++); queryTaskAbortRequestMap.insert(queryId, false); try { - const auto queryCtx = newQueryCtx( + const auto queryCtx = test::newQueryCtx( memory::memoryManager(), executor_.get(), FLAGS_arbitrator_capacity, queryId); const auto plan = plans.at(getRandomIndex(rng, plans.size() - 1)); - AssertQueryBuilder builder(plan.plan); + test::AssertQueryBuilder builder(plan.plan); builder.queryCtx(queryCtx); for (const auto& [planNodeId, nodeSplits] : plan.splits) { builder.splits(planNodeId, nodeSplits); @@ -947,4 +950,4 @@ void MemoryArbitrationFuzzer::go() { void memoryArbitrationFuzzer(size_t seed) { MemoryArbitrationFuzzer(seed).go(); } -} // namespace facebook::velox::exec::test +} // namespace facebook::velox::exec diff --git a/velox/exec/fuzzer/MemoryArbitrationFuzzer.h b/velox/exec/fuzzer/MemoryArbitrationFuzzer.h index 73f32e2215d0..622d27084ae8 100644 --- a/velox/exec/fuzzer/MemoryArbitrationFuzzer.h +++ b/velox/exec/fuzzer/MemoryArbitrationFuzzer.h @@ -17,6 +17,6 @@ #include -namespace facebook::velox::exec::test { +namespace facebook::velox::exec { void memoryArbitrationFuzzer(size_t seed); } diff --git a/velox/exec/tests/MemoryArbitrationFuzzerTest.cpp b/velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.cpp similarity index 65% rename from velox/exec/tests/MemoryArbitrationFuzzerTest.cpp rename to velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.cpp index 1ad0247fb887..cfa1e928c1d1 100644 --- a/velox/exec/tests/MemoryArbitrationFuzzerTest.cpp +++ b/velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.cpp @@ -16,13 +16,16 @@ #include #include -#include #include + +#include "velox/common/file/tests/FaultyFileSystem.h" +#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" + #include "velox/common/memory/SharedArbitrator.h" #include "velox/connectors/hive/HiveConnector.h" -#include "velox/exec/MemoryReclaimer.h" #include 
"velox/exec/fuzzer/FuzzerUtil.h" -#include "velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.h" +#include "velox/exec/fuzzer/MemoryArbitrationFuzzer.h" #include "velox/exec/fuzzer/PrestoQueryRunner.h" #include "velox/exec/fuzzer/ReferenceQueryRunner.h" @@ -39,13 +42,17 @@ DEFINE_int64( using namespace facebook::velox::exec; int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - // Calls common init functions in the necessary order, initializing - // singletons, installing proper signal handlers for better debugging + // singletons, installing proper signal handlers for a better debugging // experience, and initialize glog and gflags. folly::Init init(&argc, &argv); test::setupMemory(FLAGS_allocator_capacity, FLAGS_arbitrator_capacity); - const size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; - return test::MemoryArbitrationFuzzerRunner::run(initialSeed); + const size_t seed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; + + facebook::velox::serializer::presto::PrestoVectorSerde::registerVectorSerde(); + facebook::velox::filesystems::registerLocalFileSystem(); + facebook::velox::tests::utils::registerFaultyFileSystem(); + facebook::velox::functions::prestosql::registerAllScalarFunctions(); + facebook::velox::aggregate::prestosql::registerAllAggregateFunctions(); + memoryArbitrationFuzzer(seed); } diff --git a/velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.h b/velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.h deleted file mode 100644 index dfe8144bb21e..000000000000 --- a/velox/exec/fuzzer/MemoryArbitrationFuzzerRunner.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include "velox/common/file/FileSystems.h" - -#include "velox/common/file/tests/FaultyFileSystem.h" -#include "velox/exec/fuzzer/MemoryArbitrationFuzzer.h" -#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" -#include "velox/functions/prestosql/registration/RegistrationFunctions.h" -#include "velox/serializers/PrestoSerializer.h" - -namespace facebook::velox::exec::test { - -class MemoryArbitrationFuzzerRunner { - public: - static int run(size_t seed) { - serializer::presto::PrestoVectorSerde::registerVectorSerde(); - filesystems::registerLocalFileSystem(); - tests::utils::registerFaultyFileSystem(); - functions::prestosql::registerAllScalarFunctions(); - aggregate::prestosql::registerAllAggregateFunctions(); - memoryArbitrationFuzzer(seed); - return RUN_ALL_TESTS(); - } -}; - -} // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt index bcedf172d2f3..148841c47318 100644 --- a/velox/exec/tests/CMakeLists.txt +++ b/velox/exec/tests/CMakeLists.txt @@ -219,14 +219,6 @@ target_link_libraries( velox_tpch_connector velox_memory) -# Arbitration Fuzzer. 
-add_executable(velox_memory_arbitration_fuzzer_test - MemoryArbitrationFuzzerTest.cpp) - -target_link_libraries( - velox_memory_arbitration_fuzzer_test velox_memory_arbitration_fuzzer - GTest::gtest GTest::gtest_main) - add_executable(velox_table_evolution_fuzzer_test TableEvolutionFuzzerTest.cpp TableEvolutionFuzzer.cpp) diff --git a/velox/exec/tests/IndexLookupJoinTest.cpp b/velox/exec/tests/IndexLookupJoinTest.cpp index 857ee78d61a8..fa1162a8d6b5 100644 --- a/velox/exec/tests/IndexLookupJoinTest.cpp +++ b/velox/exec/tests/IndexLookupJoinTest.cpp @@ -1903,6 +1903,9 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, runtimeStats) { ASSERT_EQ(runtimeStats.count(IndexLookupJoin::kClientResultProcessTime), 0); ASSERT_EQ(runtimeStats.count(IndexLookupJoin::kClientLookupResultSize), 0); ASSERT_EQ(runtimeStats.count(IndexLookupJoin::kClientLookupResultRawSize), 0); + ASSERT_EQ( + runtimeStats.count(IndexLookupJoin::kClientNumLazyDecodedResultBatches), + 0); ASSERT_THAT( operatorStats.toString(true, true), testing::MatchesRegex(".*Runtime stats.*connectorLookupWallNanos:.*")); diff --git a/velox/exec/tests/TaskTest.cpp b/velox/exec/tests/TaskTest.cpp index 0aa7d1ed8bea..6b2e0e50d6b6 100644 --- a/velox/exec/tests/TaskTest.cpp +++ b/velox/exec/tests/TaskTest.cpp @@ -3144,4 +3144,75 @@ TEST_F(TaskTest, testTerminateDuringBarrierWithUnion) { ASSERT_EQ(task->taskStats().numBarriers, 1); ASSERT_EQ(task->taskStats().numFinishedSplits, 3); } + +TEST_F(TaskTest, expressionStatsInBetweenBarriers) { + // Verify that expression stats are collected in between barriers and at the + // end. + const int numRows{10}; + auto data = makeRowVector({ + makeFlatVector(numRows, [](auto row) { return row; }), + }); + auto filePath = TempFilePath::create(); + writeToFile(filePath->getPath(), {data}); + + core::PlanNodeId scanId; + core::PlanNodeId projectNodeId; + auto plan = PlanBuilder() + .tableScan(asRowType(data->type())) + .capturePlanNodeId(scanId) + .project({"c0 + 1"}) + .capturePlanNodeId(projectNodeId) + .planFragment(); + + auto queryCtx = core::QueryCtx::create(); + queryCtx->testingOverrideConfigUnsafe( + {{core::QueryConfig::kMaxOutputBatchRows, "10"}, + {core::QueryConfig::kOperatorTrackExpressionStats, "true"}}); + const auto task = Task::create( + "expressionStatsInBetweenBarriers", + plan, + 0, + std::move(queryCtx), + Task::ExecutionMode::kSerial); + ASSERT_TRUE(!task->underBarrier()); + task->addSplit( + scanId, exec::Split(makeHiveConnectorSplit(filePath->getPath()))); + auto barrierFuture = task->requestBarrier(); + ASSERT_TRUE(task->underBarrier()); + RowVectorPtr result; + do { + ContinueFuture dummyFuture{ContinueFuture::makeEmpty()}; + result = task->next(&dummyFuture); + ASSERT_FALSE(dummyFuture.valid()); + } while (result != nullptr); + auto taskStats = task->taskStats(); + ASSERT_EQ(taskStats.numBarriers, 1); + ASSERT_EQ(taskStats.numFinishedSplits, 1); + auto verifyExpressionStats = [nodeId = projectNodeId]( + const TaskStats& taskStats, + uint64_t expectedNumProcessedRows) { + ASSERT_EQ(taskStats.pipelineStats.size(), 1); + ASSERT_EQ(taskStats.pipelineStats[0].operatorStats.size(), 2); + auto& projectStats = taskStats.pipelineStats[0].operatorStats[1]; + ASSERT_EQ(projectStats.planNodeId, nodeId); + auto& expressionStats = projectStats.expressionStats; + auto it = expressionStats.find("plus"); + ASSERT_TRUE(it != expressionStats.end()); + ASSERT_EQ(it->second.numProcessedRows, expectedNumProcessedRows); + }; + verifyExpressionStats(taskStats, 10); + ASSERT_TRUE(barrierFuture.isReady()); + 
barrierFuture.wait(); + task->addSplit( + scanId, exec::Split(makeHiveConnectorSplit(filePath->getPath()))); + task->noMoreSplits(scanId); + do { + result = task->next(); + } while (result != nullptr); + VELOX_CHECK(waitForTaskCompletion(task.get())); + taskStats = task->taskStats(); + ASSERT_EQ(taskStats.numFinishedSplits, 2); + verifyExpressionStats(taskStats, 20); +} + } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/UnnestTest.cpp b/velox/exec/tests/UnnestTest.cpp index b429c54d626a..786896246932 100644 --- a/velox/exec/tests/UnnestTest.cpp +++ b/velox/exec/tests/UnnestTest.cpp @@ -681,7 +681,7 @@ TEST_P(UnnestTest, barrier) { const int numExpectedOutputVectors = bits::divRoundUp(numRowsPerSplit * 3, testData.numOutputRows) * numSplits; - auto task = AssertQueryBuilder(plan, duckDbQueryRunner_) + auto task = AssertQueryBuilder(plan) .config(core::QueryConfig::kSparkPartitionId, "0") .config( core::QueryConfig::kMaxSplitPreloadPerDriver, @@ -707,6 +707,64 @@ TEST_P(UnnestTest, barrier) { } } +TEST_P(UnnestTest, splitOutput) { + std::vector<RowVectorPtr> vectors; + const auto numBatches = 5; + const auto inputBatchSize = 2048; + for (int32_t i = 0; i < numBatches; ++i) { + auto vector = makeRowVector({ + makeFlatVector<int64_t>(inputBatchSize, [](auto row) { return row; }), + }); + vectors.push_back(vector); + } + createDuckDbTable(vectors); + + // Unnest each input row into 3 rows via sequence(1, 3). + auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>(); + core::PlanNodeId unnestPlanNodeId; + const auto plan = PlanBuilder(planNodeIdGenerator) + .values(vectors) + .project({"sequence(1, 3) as s"}) + .unnest({}, {"s"}) + .capturePlanNodeId(unnestPlanNodeId) + .planNode(); + + const auto expectedResult = makeRowVector({ + makeFlatVector<int64_t>( + numBatches * 3 * inputBatchSize, + [](auto row) { return 1 + row % 3; }), + }); + + struct { + bool produceSingleOutput; + int expectedNumOutputVectors; + + std::string toString() const { + return fmt::format( + "produceSingleOutput {}, expectedNumOutputVectors {}", + produceSingleOutput, + expectedNumOutputVectors); + } + } testSettings[] = { + {true, numBatches}, + {false, bits::divRoundUp(inputBatchSize * 3, GetParam()) * numBatches}}; + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.toString()); + auto task = AssertQueryBuilder(plan) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(GetParam())) + .config( + core::QueryConfig::kUnnestSplitOutput, + testData.produceSingleOutput ? "false" : "true") + .assertResults(expectedResult); + const auto taskStats = task->taskStats(); + ASSERT_EQ( + exec::toPlanStats(taskStats).at(unnestPlanNodeId).outputVectors, + testData.expectedNumOutputVectors); + } +} + VELOX_INSTANTIATE_TEST_SUITE_P( UnnestTest, UnnestTest, diff --git a/velox/exec/tests/utils/TableScanTestBase.cpp b/velox/exec/tests/utils/TableScanTestBase.cpp index 67b64723310b..ef571d75b6d3 100644 --- a/velox/exec/tests/utils/TableScanTestBase.cpp +++ b/velox/exec/tests/utils/TableScanTestBase.cpp @@ -1,4 +1,18 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "velox/exec/tests/utils/TableScanTestBase.h" diff --git a/velox/experimental/breeze/test/generator_common.py b/velox/experimental/breeze/test/generator_common.py index 9bb2aac382de..138ca3d7eeeb 100644 --- a/velox/experimental/breeze/test/generator_common.py +++ b/velox/experimental/breeze/test/generator_common.py @@ -71,8 +71,8 @@ if libclang_python: sys.path.append(libclang_python) -import clang.cindex -from clang.cindex import CursorKind +import clang.cindex  # noqa: E402 +from clang.cindex import CursorKind  # noqa: E402 if libclang_path: clang.cindex.Config.set_library_path(libclang_path) diff --git a/velox/experimental/breeze/test/kernel_generator.py b/velox/experimental/breeze/test/kernel_generator.py index c6654996382a..a4cedfa59c76 100755 --- a/velox/experimental/breeze/test/kernel_generator.py +++ b/velox/experimental/breeze/test/kernel_generator.py @@ -34,7 +34,7 @@ from abc import ABC, abstractmethod -AUTOGEN_HEADER = f"""/* +AUTOGEN_HEADER = """/* * This file is auto-generated from kernel_generator.py * DO NOT EDIT! */ @@ -106,7 +106,7 @@ def generate(self, tu, filename): if self.use_namespace: out.write("namespace kernels {\n\n") warp_threads = self.num_warp_threads() - if warp_threads != None: + if warp_threads is not None: out.write(f"enum {{ WARP_THREADS = {warp_threads} }};") for kernel in kernels: kernel_name = kernel["spelling"] diff --git a/velox/experimental/breeze/test/test_fixture_generator.py b/velox/experimental/breeze/test/test_fixture_generator.py index cbb6d7d19e81..3201aa6c32d0 100755 --- a/velox/experimental/breeze/test/test_fixture_generator.py +++ b/velox/experimental/breeze/test/test_fixture_generator.py @@ -34,7 +34,7 @@ import subprocess -AUTOGEN_HEADER = f"""/* +AUTOGEN_HEADER = """/* * This file is auto-generated from test_fixture_generator.py * DO NOT EDIT!
*/ @@ -263,7 +263,7 @@ def body(self, method, fixture_type_param, template_params, function_params): {preamble} {self.launcher_fn}<{thread_count}{maybe_add_shared_mem}>( {block_count}, - &kernels::{method['spelling']}{kernel_template_args}{kernel_args} + &kernels::{method["spelling"]}{kernel_template_args}{kernel_args} ); {postamble}""" @@ -414,7 +414,7 @@ def body(self, method, fixture_type_param, template_params, function_params): lambda_params = ", ".join(lambda_params) lambda_fn = f"""\ [{size_arg}]({lambda_params}){{ - kernels::{method['spelling']}{kernel_template_args}({inner_kernel_args}); + kernels::{method["spelling"]}{kernel_template_args}({inner_kernel_args}); }}\ """ return f""" @@ -430,7 +430,7 @@ def __init__(self): self.launcher_fn = "OpenCLTestDispatch" def includes(self, fixture_name): - return f""" + return """ #include #include "test/platforms/opencl_test.h" @@ -541,7 +541,7 @@ def __init__(self): self.launcher_fn = "MetalTestDispatch" def includes(self, fixture_name): - return f""" + return """ #include #include "test/platforms/metal_test.h" diff --git a/velox/experimental/cudf/.clang-format b/velox/experimental/cudf/.clang-format index 7b028e6ff684..3cd9f5e421ff 100644 --- a/velox/experimental/cudf/.clang-format +++ b/velox/experimental/cudf/.clang-format @@ -24,4 +24,4 @@ IncludeCategories: - Regex: '^<.*\..*' # other system includes (e.g. with a '.') Priority: 9 - Regex: '^<[^.]+' # STL includes (no '.') - Priority: 10 \ No newline at end of file + Priority: 10 diff --git a/velox/experimental/wave/README.md b/velox/experimental/wave/README.md index 008e48b50f4c..9704aaee0ea9 100644 --- a/velox/experimental/wave/README.md +++ b/velox/experimental/wave/README.md @@ -17,17 +17,16 @@ limitations under the License. # CMake: Use Base Functions > [!IMPORTANT] Please use `target_link_libraries` and `add_library` -> instead of the `velox_*` functions when adding or linking to targets +> instead of the `velox_*` functions when adding or linking to targets > within wave/ and label tests with `cuda_driver`. The `wave` GPU component links against the CUDA driver in several targets. They can be built on machines without the actual driver installed, this requires the relevant 'stub' packages to be installed (see setup scripts). -Any library that statically links against the stubs **can not** run on a -machine without an actual CUDA driver installed (like our CI). -For this reason we need to use the base functions to create standalone +Any library that statically links against the stubs **can not** run on a +machine without an actual CUDA driver installed (like our CI). +For this reason we need to use the base functions to create standalone libraries for wave to avoid linking statically against the stubs when building the monolithic library and label any tests with 'cuda_driver' to allow excluding them from ctest on machines without the driver. 
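A concrete way to see the distinction the README draws: the stub packages satisfy the linker at build time, but only a machine with a real driver provides `libcuda.so.1` at runtime, which is why `cuda_driver`-labeled tests must be excludable from ctest. A minimal runtime probe illustrating this (not part of wave; assumes Linux and the standard driver library name):

    // Illustration only: probe for the real CUDA driver at runtime.
    #include <dlfcn.h>
    #include <iostream>

    int main() {
      void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
      if (handle == nullptr) {
        std::cout << "No CUDA driver found; skip tests labeled cuda_driver.\n";
        return 1;
      }
      std::cout << "CUDA driver present.\n";
      dlclose(handle);
      return 0;
    }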
- diff --git a/velox/experimental/wave/exec/tests/HashJoinTest.cpp b/velox/experimental/wave/exec/tests/HashJoinTest.cpp index ee1b06c83498..cf1590168bd9 100644 --- a/velox/experimental/wave/exec/tests/HashJoinTest.cpp +++ b/velox/experimental/wave/exec/tests/HashJoinTest.cpp @@ -124,3 +124,66 @@ TEST_F(HashJoinTest, manyHits) { "SELECT t_k1, t_k2, t_data, u_k1, u_k2, u_data FROM t, u WHERE t_k1 = u_k1 AND t_k2 = u_k2") .run(); } + +TEST_F(HashJoinTest, DISABLED_twoKeysLeft) { + probeType_ = ROW({"t_k1", "t_k2", "t_data"}, {BIGINT(), BIGINT(), BIGINT()}); + buildType_ = ROW({"u_k1", "u_k2", "u_data"}, {BIGINT(), BIGINT(), BIGINT()}); + + auto build = makeRowVector( + {"u_k1", "u_k2", "u_data"}, + {makeFlatVector(1000, [&](auto r) { return r; }), + makeFlatVector(1000, [&](auto r) { return r; }), + makeFlatVector(1000, [&](auto r) { return r; })}); + auto probe = makeRowVector( + {"t_k1", "t_k2", "t_data"}, + {makeFlatVector(1000, [&](auto r) { return r + 100; }), + makeFlatVector(1000, [&](auto r) { return r + 100; }), + makeFlatVector(1000, [&](auto r) { return r + 2; })}); + + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .numDrivers(1) + .probeType(probeType_) + .probeKeys({"t_k1", "t_k2"}) + .probeVectors({probe}) + .joinType(core::JoinType::kLeft) + .buildType(buildType_) + .buildKeys({"u_k1", "u_k2"}) + .buildVectors({build}) + .injectSpill(false) + .referenceQuery( + "SELECT t_k1, t_k2, t_data, u_k1, u_k2, u_data FROM t LEFT JOIN u ON t_k1 = u_k1 AND t_k2 = u_k2") + .run(); +} + +TEST_F(HashJoinTest, DISABLED_manyHitsLeft) { + probeType_ = ROW({"t_k1", "t_k2", "t_data"}, {BIGINT(), BIGINT(), BIGINT()}); + buildType_ = ROW({"u_k1", "u_k2", "u_data"}, {BIGINT(), BIGINT(), BIGINT()}); + + int32_t numRepeats = 20; + auto build = makeRowVector( + {"u_k1", "u_k2", "u_data"}, + {makeFlatVector( + 15000, [&](auto r) { return (r / numRepeats) * 9; }), + makeFlatVector( + 15000, [&](auto r) { return (r / numRepeats) * 9; }), + makeFlatVector(15000, [&](auto r) { return r; })}); + auto probe = makeRowVector( + {"t_k1", "t_k2", "t_data"}, + {makeFlatVector(1000, [&](auto r) { return r * 3; }), + makeFlatVector(1000, [&](auto r) { return r * 3; }), + makeFlatVector(1000, [&](auto r) { return r; })}); + + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .numDrivers(1) + .probeType(probeType_) + .probeKeys({"t_k1", "t_k2"}) + .probeVectors({probe}) + .joinType(core::JoinType::kLeft) + .buildType(buildType_) + .buildKeys({"u_k1", "u_k2"}) + .buildVectors({build}) + .injectSpill(false) + .referenceQuery( + "SELECT t_k1, t_k2, t_data, u_k1, u_k2, u_data FROM t LEFT JOIN u ON t_k1 = u_k1 AND t_k2 = u_k2") + .run(); +} diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index 54c6ff629480..913fee074d10 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -1683,11 +1683,7 @@ void Expr::appendInputsSql( } bool Expr::isConstant() const { - if (!isDeterministic()) { - return false; - } - - return distinctFields_.empty(); + return isDeterministic() && distinctFields_.empty(); } namespace { @@ -2041,25 +2037,25 @@ core::ExecCtx* SimpleExpressionEvaluator::ensureExecCtx() { return execCtx_.get(); } -VectorPtr evaluateConstantExpression( - const core::TypedExprPtr& expr, - memory::MemoryPool* pool) { - auto result = tryEvaluateConstantExpression(expr, pool); - VELOX_USER_CHECK_NOT_NULL( - result, "Expression is not constant-foldable: {}", expr->toString()); - return result; -} - VectorPtr tryEvaluateConstantExpression( const 
core::TypedExprPtr& expr,
-    memory::MemoryPool* pool) {
-  auto data = BaseVector::create(ROW({}), 1, pool);
-
+    memory::MemoryPool* pool,
+    bool suppressEvaluationFailures) {
  auto queryCtx = velox::core::QueryCtx::create();
  velox::core::ExecCtx execCtx{pool, queryCtx.get()};
  velox::exec::ExprSet exprSet({expr}, &execCtx);
-  if (exprSet.expr(0)->is<ConstantExpr>()) {
+  // The construction of ExprSet involves compiling and constant folding the
+  // expression. If constant folding succeeded, then we get a ConstantExpr.
+  // Constant folding may fail because the expression is not constant-foldable
+  // or because an error happened during evaluation (5 / 0 fails with
+  // "division by zero"). If constant folding didn't succeed and
+  // suppressEvaluationFailures is false, we need to re-evaluate the
+  // expression to propagate the failure.
+  const bool doEvaluate = exprSet.expr(0)->is<ConstantExpr>() ||
+      (!suppressEvaluationFailures && exprSet.expr(0)->isConstant());
+
+  if (doEvaluate) {
+    auto data = BaseVector::create(ROW({}), 1, pool);
    velox::exec::EvalCtx evalCtx(&execCtx, &exprSet, data.get());
    velox::SelectivityVector singleRow(1);
    std::vector<VectorPtr> results(1);
diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index fd7d462342de..98d72c425bc6 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h
@@ -25,6 +25,7 @@
#include "velox/core/Expressions.h"
#include "velox/expression/DecodedArgs.h"
#include "velox/expression/EvalCtx.h"
+#include "velox/expression/ExprStats.h"
#include "velox/expression/VectorFunction.h"
#include "velox/type/Subfield.h"
#include "velox/vector/SimpleVector.h"
@@ -35,38 +36,6 @@
class ExprSet;
class FieldReference;
class VectorFunction;
-struct ExprStats {
-  /// Requires QueryConfig.exprTrackCpuUsage() to be 'true'.
-  CpuWallTiming timing;
-
-  /// Number of processed rows.
-  uint64_t numProcessedRows{0};
-
-  /// Number of processed vectors / batches. Allows to compute average batch
-  /// size.
-  uint64_t numProcessedVectors{0};
-
-  /// Whether default-null behavior of an expression resulted in skipping
-  /// evaluation of rows.
-  bool defaultNullRowsSkipped{false};
-
-  void add(const ExprStats& other) {
-    timing.add(other.timing);
-    numProcessedRows += other.numProcessedRows;
-    numProcessedVectors += other.numProcessedVectors;
-    defaultNullRowsSkipped |= other.defaultNullRowsSkipped;
-  }
-
-  std::string toString() const {
-    return fmt::format(
-        "timing: {}, numProcessedRows: {}, numProcessedVectors: {}, defaultNullRowsSkipped: {}",
-        timing.toString(),
-        numProcessedRows,
-        numProcessedVectors,
-        defaultNullRowsSkipped ? "true" : "false");
-  }
-};
-
/// Maintains a set of rows for evaluation and removes rows with
/// nulls or errors as needed. Helps to avoid copying SelectivityVector in cases
/// when evaluation doesn't encounter nulls or errors.
@@ -836,19 +805,18 @@
std::unique_ptr<ExprSet> makeExprSetFromFlag(
    std::vector<core::TypedExprPtr>&& source,
    core::ExecCtx* execCtx);
-/// Evaluates a deterministic expression that doesn't depend on any inputs and
-/// returns the result as single-row vector. Throws if expression is
-/// non-deterministic or has dependencies.
-VectorPtr evaluateConstantExpression(
-    const core::TypedExprPtr& expr,
-    memory::MemoryPool* pool);
-
/// Evaluates a deterministic expression that doesn't depend on any inputs and
/// returns the result as single-row vector. Returns nullptr if the expression
/// is non-deterministic or has dependencies.
+///
+/// By default, propagates failures that occur during evaluation of the
+/// expression. For example, evaluating 5 / 0 throws "division by zero". If
+/// 'suppressEvaluationFailures' is true, these failures are swallowed and the
+/// caller receives a nullptr result.
VectorPtr tryEvaluateConstantExpression(
    const core::TypedExprPtr& expr,
-    memory::MemoryPool* pool);
+    memory::MemoryPool* pool,
+    bool suppressEvaluationFailures = false);
/// Returns a string representation of the expression trees annotated with
/// runtime statistics. Expected to be called after calling ExprSet::eval one or
diff --git a/velox/expression/ExprStats.h b/velox/expression/ExprStats.h new file mode 100644 index 000000000000..2d285b33ac14 --- /dev/null +++ b/velox/expression/ExprStats.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/common/time/CpuWallTimer.h"
+
+namespace facebook::velox::exec {
+
+struct ExprStats {
+  /// Requires QueryConfig.exprTrackCpuUsage() to be 'true'.
+  CpuWallTiming timing;
+
+  /// Number of processed rows.
+  uint64_t numProcessedRows{0};
+
+  /// Number of processed vectors / batches. Allows to compute average batch
+  /// size.
+  uint64_t numProcessedVectors{0};
+
+  /// Whether default-null behavior of an expression resulted in skipping
+  /// evaluation of rows.
+  bool defaultNullRowsSkipped{false};
+
+  void add(const ExprStats& other) {
+    timing.add(other.timing);
+    numProcessedRows += other.numProcessedRows;
+    numProcessedVectors += other.numProcessedVectors;
+    defaultNullRowsSkipped |= other.defaultNullRowsSkipped;
+  }
+
+  std::string toString() const {
+    return fmt::format(
+        "timing: {}, numProcessedRows: {}, numProcessedVectors: {}, defaultNullRowsSkipped: {}",
+        timing.toString(),
+        numProcessedRows,
+        numProcessedVectors,
+        defaultNullRowsSkipped ? "true" : "false");
+  }
+};
+} // namespace facebook::velox::exec
diff --git a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp index 35957d86361f..c5e9357dcf26 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp
@@ -136,6 +136,8 @@ int main(int argc, char** argv) {
      "st_asbinary",
      "st_boundary",
      "st_centroid",
+      "st_distance",
+      "st_geometrytype",
      "st_relate",
      "st_contains",
      "st_crosses",
diff --git a/velox/expression/signature_parser/SignatureParser.ll b/velox/expression/signature_parser/SignatureParser.ll index d11063122865..6ba5006b2da0 100644 --- a/velox/expression/signature_parser/SignatureParser.ll +++ b/velox/expression/signature_parser/SignatureParser.ll
@@ -5,6 +5,30 @@
#include "velox/expression/signature_parser/SignatureParser.yy.h" // @manual
#include "velox/expression/signature_parser/Scanner.h"
#define YY_DECL int facebook::velox::exec::Scanner::lex(facebook::velox::exec::Parser::semantic_type *yylval)
+
+std::string unescape_doublequote(const char* yytext) {
+  size_t len = strlen(yytext);
+  std::string output;
+  output.resize(len);
+
+  // Use size_t indices to match 'len'; writing the bound as 'i + 1 < len'
+  // also keeps the loop safe for empty input.
+  size_t i = 0;
+  size_t j = 0;
+
+  while (i + 1 < len) {
+    if (yytext[i] == '"' && yytext[i+1] == '"') {
+      output[j++] = '"';
+      i += 2;
+    } else {
+      output[j++] = yytext[i++];
+    }
+  }
+  // Check if the last character needs to be added.
+  if (i < len) {
+    output[j++] = yytext[i++];
+  }
+  output.resize(j);
+  return output;
+}
%}
%option c++ noyywrap noyylineno nodefault caseless
@@ -34,7 +58,7 @@ Y [Y|y]
Z [Z|z]
WORD ([[:alnum:]_]*)
-QUOTED_ID (['"'][[:alnum:][:space:]_]*['"'])
+QUOTED_ID (\"([^\"\n]|\"\")*\")
ROW (ROW|STRUCT)
%%
@@ -48,7 +72,7 @@ ROW (ROW|STRUCT)
(DECIMAL) yylval->build<std::string>(YYText()); return Parser::token::DECIMAL;
{ROW} return Parser::token::ROW;
{WORD} yylval->build<std::string>(YYText()); return Parser::token::WORD;
-{QUOTED_ID} yylval->build<std::string>(YYText()); return Parser::token::QUOTED_ID;
+{QUOTED_ID} {auto val = unescape_doublequote(YYText()); yylval->build<std::string>(val.c_str()); return Parser::token::QUOTED_ID;}
<<EOF>> return Parser::token::YYEOF;
.
/* no action on unmatched input */
diff --git a/velox/expression/signature_parser/tests/ParseTypeSignatureTest.cpp b/velox/expression/signature_parser/tests/ParseTypeSignatureTest.cpp index daa0d8545269..2e06ec159a1e 100644 --- a/velox/expression/signature_parser/tests/ParseTypeSignatureTest.cpp +++ b/velox/expression/signature_parser/tests/ParseTypeSignatureTest.cpp
@@ -230,6 +230,35 @@ TEST_F(ParseTypeSignatureTest, row) {
  ASSERT_EQ(rowfield.rowFieldName(), "bla");
  ASSERT_EQ(rowfield.parameters().size(), 0);
}
+
+  {
+    auto signature = parseTypeSignature("row(\"a (b)\" INTEGER)");
+    ASSERT_EQ(signature.baseName(), "row");
+    ASSERT_EQ(signature.parameters().size(), 1);
+    auto field0 = signature.parameters()[0];
+    ASSERT_EQ(field0.baseName(), "INTEGER");
+    ASSERT_EQ(field0.rowFieldName(), "a (b)");
+  }
+
+  // Test escaping of embedded double quotes by doubling them.
+  {
+    auto signature = parseTypeSignature(
+        "row(\"a\"\"b\" INTEGER, \"\"\"ab\" INTEGER, \"ab\"\"\"\"\" INTEGER)");
+    ASSERT_EQ(signature.baseName(), "row");
+    ASSERT_EQ(signature.parameters().size(), 3);
+    auto field0 = signature.parameters()[0];
+    ASSERT_EQ(field0.baseName(), "INTEGER");
+    ASSERT_EQ(field0.rowFieldName(), "a\"b");
+    auto field1 = signature.parameters()[1];
+    ASSERT_EQ(field1.baseName(), "INTEGER");
+    ASSERT_EQ(field1.rowFieldName(), "\"ab");
+    auto field2 = signature.parameters()[2];
+    ASSERT_EQ(field2.baseName(), "INTEGER");
+    ASSERT_EQ(field2.rowFieldName(), "ab\"\"");
+  }
+
+  // An unescaped double quote is an error.
+  EXPECT_THROW(parseTypeSignature("row(\"a\"b\" INTEGER)"), VeloxRuntimeError);
}
TEST_F(ParseTypeSignatureTest, tdigest) {
diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 0774a132ba48..ddef5a983922 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp
@@ -4987,7 +4987,13 @@ TEST_F(ExprTest, disabledeferredLazyLoading) {
TEST_F(ExprTest, evaluateConstantExpression) {
  auto eval = [&](const std::string& sql) {
    auto expr = parseExpression(sql, ROW({"a"}, {BIGINT()}));
-    return exec::evaluateConstantExpression(expr, pool());
+    return exec::tryEvaluateConstantExpression(expr, pool());
+  };
+
+  auto evalNoThrow = [&](const std::string& sql) {
+    auto expr = parseExpression(sql, ROW({"a"}, {BIGINT()}));
+    return exec::tryEvaluateConstantExpression(
+        expr, pool(), true /* suppressEvaluationFailures */);
  };
  assertEqualVectors(eval("1 + 2"), makeConstant<int64_t>(3, 1));
@@ -5009,28 +5015,23 @@
          "try(coalesce(array_min_by(array[1, 2, 3], x -> x / 0), 0::INTEGER))"),
      makeNullConstant(TypeKind::INTEGER, 1));
-  auto tryEval = [&](const std::string& sql) {
-    auto expr = parseExpression(sql, ROW({"a"}, {BIGINT()}));
-    return exec::tryEvaluateConstantExpression(expr, pool());
-  };
+  EXPECT_TRUE(eval("a + 1") == nullptr);
-  VELOX_ASSERT_THROW(eval("a + 1"), "Expression is not constant-foldable");
-  ASSERT_TRUE(tryEval("a + 1") == nullptr);
+  EXPECT_TRUE(eval("rand() + 1.0") == nullptr);
-  VELOX_ASSERT_THROW(
-      eval("rand() + 1.0"), "Expression is not constant-foldable");
-  ASSERT_TRUE(tryEval("rand() + 1.0") == nullptr);
+  EXPECT_TRUE(eval("transform(array[1, 2, 3], x -> (x * 2) + a)") == nullptr);
-  VELOX_ASSERT_THROW(
-      eval("transform(array[1, 2, 3], x -> (x * 2) + a)"),
-      "Expression is not constant-foldable");
-  ASSERT_TRUE(
-      tryEval("transform(array[1, 2, 3], x -> (x * 2) + a)") == nullptr);
+  EXPECT_TRUE(eval("transform(array[1, 2, 3], x -> x + rand())") == nullptr);
+
+  VELOX_ASSERT_THROW(eval("5 / 0"), "division by
zero"); + EXPECT_TRUE(evalNoThrow("5 / 0") == nullptr); + + VELOX_ASSERT_THROW(eval("1 + 5 / 0"), "division by zero"); + EXPECT_TRUE(evalNoThrow("1 + 5 / 0") == nullptr); VELOX_ASSERT_THROW( - eval("transform(array[1, 2, 3], x -> x + rand())"), - "Expression is not constant-foldable"); - ASSERT_TRUE(tryEval("transform(array[1, 2, 3], x -> x + rand())") == nullptr); + eval("transform(array[1, 2, 3], x -> x / 0)"), "division by zero"); + EXPECT_TRUE(evalNoThrow("transform(array[1, 2, 3], x -> x / 0)") == nullptr); } TEST_F(ExprTest, isDeterministic) { diff --git a/velox/functions/lib/UpperLower.h b/velox/functions/lib/UpperLower.h new file mode 100644 index 000000000000..6306c0561284 --- /dev/null +++ b/velox/functions/lib/UpperLower.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/expression/StringWriter.h" +#include "velox/expression/VectorFunction.h" +#include "velox/functions/lib/StringEncodingUtils.h" +#include "velox/functions/lib/string/StringImpl.h" + +namespace facebook::velox::functions { + +/// Function to convert string to upper or lower case. Ascii and unicode +/// conversion are supported. +/// @tparam isLower Instantiate for upper or lower. +/// @tparam forSpark When true, Spark's specific behavior is considered, e.g. +/// for 'İ' Spark's lower case is 'i̇' and Presto's is 'i'. +template +class UpperLowerTemplateFunction : public exec::VectorFunction { + private: + // String encoding wrappable function. + template + struct ApplyInternal { + static void apply( + const SelectivityVector& rows, + const DecodedVector* decodedInput, + FlatVector* results) { + rows.applyToSelected([&](auto row) { + auto proxy = exec::StringWriter(results, row); + if constexpr (isLower) { + stringImpl::lower( + proxy, decodedInput->valueAt(row)); + } else { + stringImpl::upper( + proxy, decodedInput->valueAt(row)); + } + proxy.finalize(); + }); + } + }; + + public: + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /*outputType*/, + exec::EvalCtx& context, + VectorPtr& result) const override { + VELOX_CHECK(args.size() == 1); + VELOX_CHECK(args[0]->typeKind() == TypeKind::VARCHAR); + + // Read content before calling prepare results. + BaseVector* inputStringsVector = args[0].get(); + exec::LocalDecodedVector inputHolder(context, *inputStringsVector, rows); + auto decodedInput = inputHolder.get(); + + auto ascii = isAscii(inputStringsVector, rows); + + // Not in place path. 
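+    // ('In place' refers to reusing the input's string buffers; this generic
+    // path instead decodes the input first and then writes into a freshly
+    // prepared flat result, so it stays correct even if 'result' aliases the
+    // input vector.)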
+ VectorPtr emptyVectorPtr; + prepareFlatResultsVector(result, rows, context, emptyVectorPtr); + auto* resultFlatVector = result->as>(); + + StringEncodingTemplateWrapper::apply( + ascii, rows, decodedInput, resultFlatVector); + } + + static std::vector> signatures() { + // varchar -> varchar + return {exec::FunctionSignatureBuilder() + .returnType("varchar") + .argumentType("varchar") + .build()}; + } + + bool ensureStringEncodingSetAtAllInputs() const override { + return true; + } + + bool propagateStringEncodingFromAllInputs() const override { + return true; + } +}; + +} // namespace facebook::velox::functions diff --git a/velox/functions/lib/string/StringCore.h b/velox/functions/lib/string/StringCore.h index eff3008903e1..b1c8417a570b 100644 --- a/velox/functions/lib/string/StringCore.h +++ b/velox/functions/lib/string/StringCore.h @@ -172,6 +172,7 @@ FOLLY_ALWAYS_INLINE size_t upperUnicode( /// large enough for the results outputLength refers to the number of bytes /// available in the output buffer, and inputLength is the number of bytes in /// the input string +template FOLLY_ALWAYS_INLINE size_t lowerUnicode( char* output, size_t outputLength, @@ -194,6 +195,18 @@ FOLLY_ALWAYS_INLINE size_t lowerUnicode( } inputIdx += size; + + if constexpr (forSpark) { + // Handle Turkish-specific case for İ (U+0130). + if (UNLIKELY(nextCodePoint == 0x0130)) { + // Map to i̇ (U+0069 U+0307). + output[outputIdx++] = 0x69; + output[outputIdx++] = 0xCC; + output[outputIdx++] = 0x87; + continue; + } + } + auto lowerCodePoint = utf8proc_tolower(nextCodePoint); assert( diff --git a/velox/functions/lib/string/StringImpl.h b/velox/functions/lib/string/StringImpl.h index d1b8896fa01d..ff686ca40e9f 100644 --- a/velox/functions/lib/string/StringImpl.h +++ b/velox/functions/lib/string/StringImpl.h @@ -35,7 +35,11 @@ namespace facebook::velox::functions::stringImpl { using namespace stringCore; /// Perform upper for a UTF8 string -template +template < + bool ascii, + bool forSpark = false, + typename TOutString, + typename TInString> FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) { if constexpr (ascii) { output.resize(input.size()); @@ -50,15 +54,19 @@ FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) { } /// Perform lower for a UTF8 string -template +template < + bool ascii, + bool forSpark = false, + typename TOutString, + typename TInString> FOLLY_ALWAYS_INLINE bool lower(TOutString& output, const TInString& input) { if constexpr (ascii) { output.resize(input.size()); lowerAscii(output.data(), input.data(), input.size()); } else { output.resize(input.size() * 4); - auto size = - lowerUnicode(output.data(), output.size(), input.data(), input.size()); + auto size = lowerUnicode( + output.data(), output.size(), input.data(), input.size()); output.resize(size); } return true; diff --git a/velox/functions/lib/string/tests/StringImplTest.cpp b/velox/functions/lib/string/tests/StringImplTest.cpp index 545d457bcc76..7656bec5f053 100644 --- a/velox/functions/lib/string/tests/StringImplTest.cpp +++ b/velox/functions/lib/string/tests/StringImplTest.cpp @@ -42,7 +42,33 @@ class StringImplTest : public testing::Test { {"àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ"}, {"αβγδεζηθικλμνξοπρςστυφχψ", "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨ"}, {"абвгдежзийклмнопрстуфхцчшщъыьэюя", - "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"}}; + "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"}, + {"\u0069", "\u0049"}, + {"\u03C3", "\u03A3"}, + {"i\xCC\x87", "I\xCC\x87"}, + {"\u010B", "\u010A"}, + 
{"\u0117", "\u0116"}, + {"\u0121", "\u0120"}, + {"\u017C", "\u017B"}, + {"\u0227", "\u0226"}, + {"\u022F", "\u022E"}, + {"\u1E03", "\u1E02"}, + {"\u1E0B", "\u1E0A"}, + {"\u1E1F", "\u1E1E"}, + {"\u1E23", "\u1E22"}, + {"\u1E41", "\u1E40"}, + {"\u1E45", "\u1E44"}, + {"\u1E57", "\u1E56"}, + {"\u1E59", "\u1E58"}, + {"\u1E61", "\u1E60"}, + {"\u1E65", "\u1E64"}, + {"\u1E67", "\u1E66"}, + {"\u1E69", "\u1E68"}, + {"\u1E6B", "\u1E6A"}, + {"\u1E87", "\u1E86"}, + {"\u1E8B", "\u1E8A"}, + {"\u1E8F", "\u1E8E"}, + }; } std::vector> getLowerAsciiTestData() { @@ -58,7 +84,32 @@ class StringImplTest : public testing::Test { {"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ", "àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ"}, {"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨ", "αβγδεζηθικλμνξοπρσστυφχψ"}, {"АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", - "абвгдежзийклмнопрстуфхцчшщъыьэюя"}}; + "абвгдежзийклмнопрстуфхцчшщъыьэюя"}, + {"\u0130", "\u0069"}, + {"\u03A3", "\u03C3"}, + {"I\xCC\x87", "i\xCC\x87"}, + {"\u010A", "\u010B"}, + {"\u0116", "\u0117"}, + {"\u0120", "\u0121"}, + {"\u017B", "\u017C"}, + {"\u0226", "\u0227"}, + {"\u022E", "\u022F"}, + {"\u1E02", "\u1E03"}, + {"\u1E0A", "\u1E0B"}, + {"\u1E1E", "\u1E1F"}, + {"\u1E22", "\u1E23"}, + {"\u1E40", "\u1E41"}, + {"\u1E44", "\u1E45"}, + {"\u1E56", "\u1E57"}, + {"\u1E58", "\u1E59"}, + {"\u1E60", "\u1E61"}, + {"\u1E64", "\u1E65"}, + {"\u1E66", "\u1E67"}, + {"\u1E68", "\u1E69"}, + {"\u1E6A", "\u1E6B"}, + {"\u1E86", "\u1E87"}, + {"\u1E8A", "\u1E8B"}, + {"\u1E8E", "\u1E8F"}}; } }; diff --git a/velox/functions/prestosql/CMakeLists.txt b/velox/functions/prestosql/CMakeLists.txt index 34f48edc4dc6..f4c750672fce 100644 --- a/velox/functions/prestosql/CMakeLists.txt +++ b/velox/functions/prestosql/CMakeLists.txt @@ -55,6 +55,7 @@ velox_add_library( TransformKeys.cpp TransformValues.cpp TypeOf.cpp + UpperLower.cpp URIParser.cpp URLFunctions.cpp VectorArithmetic.cpp diff --git a/velox/functions/prestosql/GeometryFunctions.h b/velox/functions/prestosql/GeometryFunctions.h index c026c6b7ba51..d5bed71c7fdb 100644 --- a/velox/functions/prestosql/GeometryFunctions.h +++ b/velox/functions/prestosql/GeometryFunctions.h @@ -576,7 +576,7 @@ struct StXFunction { geospatial::deserializeGeometry(geometry); if (geosGeometry->getGeometryTypeId() != geos::geom::GeometryTypeId::GEOS_POINT) { - throw Status::UserError(fmt::format( + VELOX_USER_FAIL(fmt::format( "ST_X requires a Point geometry, found {}", geosGeometry->getGeometryType())); } @@ -600,7 +600,7 @@ struct StYFunction { geospatial::deserializeGeometry(geometry); if (geosGeometry->getGeometryTypeId() != geos::geom::GeometryTypeId::GEOS_POINT) { - throw Status::UserError(fmt::format( + VELOX_USER_FAIL(fmt::format( "ST_Y requires a Point geometry, found {}", geosGeometry->getGeometryType())); } @@ -712,4 +712,50 @@ struct SimplifyGeometryFunction { } }; +template +struct StGeometryTypeFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE Status + call(out_type& result, const arg_type& input) { + std::unique_ptr geosGeometry = + geospatial::deserializeGeometry(input); + + result = geosGeometry->getGeometryType(); + + return Status::OK(); + } +}; + +template +struct StDistanceFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE bool call( + out_type& result, + const arg_type& geometry1, + const arg_type& geometry2) { + std::unique_ptr geosGeometry1 = + geospatial::deserializeGeometry(geometry1); + std::unique_ptr geosGeometry2 = + geospatial::deserializeGeometry(geometry2); + + if (geosGeometry1->getSRID() != geosGeometry2->getSRID()) { + 
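// Distance is only meaningful when both geometries share a spatial
+      // reference system; mismatched SRIDs are a user error, matching Presto.
+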
VELOX_USER_FAIL(fmt::format( + "Input geometries must have the same spatial reference, found {} and {}", + geosGeometry1->getSRID(), + geosGeometry2->getSRID())); + } + + if (geosGeometry1->isEmpty() || geosGeometry2->isEmpty()) { + return false; + } + + GEOS_RETHROW(result = geosGeometry1->distance(geosGeometry2.get()); + , "Failed to calculate geometry distance"); + + return true; + } +}; + } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/Probability.h b/velox/functions/prestosql/Probability.h index 6a2173c925ce..cb4dce56b2b6 100644 --- a/velox/functions/prestosql/Probability.h +++ b/velox/functions/prestosql/Probability.h @@ -97,6 +97,11 @@ struct BinomialCDFFunction { return; } + if (value >= numOfTrials) { + result = 1.0; + return; + } + boost::math::binomial_distribution<> dist(numOfTrials, successProb); result = boost::math::cdf(dist, value); } diff --git a/velox/functions/prestosql/StringFunctions.cpp b/velox/functions/prestosql/StringFunctions.cpp index 22207ad5274e..f29bf1581547 100644 --- a/velox/functions/prestosql/StringFunctions.cpp +++ b/velox/functions/prestosql/StringFunctions.cpp @@ -27,76 +27,6 @@ namespace facebook::velox::functions { using namespace stringCore; namespace { -/** - * Upper and Lower functions have a fast path for ascii where the functions - * can be applied in place. - * */ -template -class UpperLowerTemplateFunction : public exec::VectorFunction { - private: - /// String encoding wrappable function - template - struct ApplyInternal { - static void apply( - const SelectivityVector& rows, - const DecodedVector* decodedInput, - FlatVector* results) { - rows.applyToSelected([&](int row) { - auto proxy = exec::StringWriter(results, row); - if constexpr (isLower) { - stringImpl::lower( - proxy, decodedInput->valueAt(row)); - } else { - stringImpl::upper( - proxy, decodedInput->valueAt(row)); - } - proxy.finalize(); - }); - } - }; - - public: - void apply( - const SelectivityVector& rows, - std::vector& args, - const TypePtr& /* outputType */, - exec::EvalCtx& context, - VectorPtr& result) const override { - VELOX_CHECK(args.size() == 1); - VELOX_CHECK(args[0]->typeKind() == TypeKind::VARCHAR); - - // Read content before calling prepare results - BaseVector* inputStringsVector = args[0].get(); - exec::LocalDecodedVector inputHolder(context, *inputStringsVector, rows); - auto decodedInput = inputHolder.get(); - - auto ascii = isAscii(inputStringsVector, rows); - - // Not in place path. 
- VectorPtr emptyVectorPtr; - prepareFlatResultsVector(result, rows, context, emptyVectorPtr); - auto* resultFlatVector = result->as>(); - - StringEncodingTemplateWrapper::apply( - ascii, rows, decodedInput, resultFlatVector); - } - - static std::vector> signatures() { - // varchar -> varchar - return {exec::FunctionSignatureBuilder() - .returnType("varchar") - .argumentType("varchar") - .build()}; - } - - bool ensureStringEncodingSetAtAllInputs() const override { - return true; - } - - bool propagateStringEncodingFromAllInputs() const override { - return true; - } -}; /** * concat(string1, ..., stringN) → varchar @@ -387,16 +317,6 @@ class Replace : public exec::VectorFunction { }; } // namespace -VELOX_DECLARE_VECTOR_FUNCTION( - udf_upper, - UpperLowerTemplateFunction::signatures(), - std::make_unique>()); - -VELOX_DECLARE_VECTOR_FUNCTION( - udf_lower, - UpperLowerTemplateFunction::signatures(), - std::make_unique>()); - VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION_WITH_METADATA( udf_concat, ConcatFunction::signatures(), diff --git a/velox/functions/prestosql/UpperLower.cpp b/velox/functions/prestosql/UpperLower.cpp new file mode 100644 index 000000000000..dcb668ae2c28 --- /dev/null +++ b/velox/functions/prestosql/UpperLower.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/functions/lib/UpperLower.h" + +namespace facebook::velox::functions { + +using PrestoUpperFunction = + UpperLowerTemplateFunction; +using PrestoLowerFunction = + UpperLowerTemplateFunction; + +VELOX_DECLARE_VECTOR_FUNCTION( + udf_upper, + PrestoUpperFunction::signatures(), + (std::make_unique())); + +VELOX_DECLARE_VECTOR_FUNCTION( + udf_lower, + PrestoLowerFunction::signatures(), + (std::make_unique())); + +} // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/aggregates/AggregateNames.h b/velox/functions/prestosql/aggregates/AggregateNames.h index 1ae106ffda58..fd4a15bd85fd 100644 --- a/velox/functions/prestosql/aggregates/AggregateNames.h +++ b/velox/functions/prestosql/aggregates/AggregateNames.h @@ -58,6 +58,7 @@ const char* const kMin = "min"; const char* const kMinBy = "min_by"; const char* const kMultiMapAgg = "multimap_agg"; const char* const kNoisyCountIfGaussian = "noisy_count_if_gaussian"; +const char* const kNoisyCountGaussian = "noisy_count_gaussian"; const char* const kReduceAgg = "reduce_agg"; const char* const kRegrIntercept = "regr_intercept"; const char* const kRegrSlop = "regr_slope"; diff --git a/velox/functions/prestosql/aggregates/CMakeLists.txt b/velox/functions/prestosql/aggregates/CMakeLists.txt index a96790ace830..347bde781830 100644 --- a/velox/functions/prestosql/aggregates/CMakeLists.txt +++ b/velox/functions/prestosql/aggregates/CMakeLists.txt @@ -46,6 +46,7 @@ velox_add_library( ReduceAgg.cpp RegisterAggregateFunctions.cpp NoisyCountIfGaussianAggregate.cpp + NoisyCountGaussianAggregate.cpp SetAggregates.cpp SumAggregate.cpp SumDataSizeForStatsAggregate.cpp diff --git a/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.cpp b/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.cpp new file mode 100644 index 000000000000..effdb42a78c1 --- /dev/null +++ b/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.h" +#include +#include "velox/exec/Aggregate.h" +#include "velox/expression/FunctionSignature.h" +#include "velox/functions/lib/aggregates/noisy_aggregation/NoisyCountAccumulator.h" +#include "velox/functions/prestosql/aggregates/AggregateNames.h" +#include "velox/vector/DecodedVector.h" +#include "velox/vector/FlatVector.h" + +using namespace facebook::velox::functions::aggregate; + +namespace facebook::velox::aggregate::prestosql { + +namespace { +class NoisyCountGaussianAggregate : public exec::Aggregate { + public: + explicit NoisyCountGaussianAggregate(TypePtr resultType) + : exec::Aggregate(std::move(resultType)) {} + + using AccumulatorType = NoisyCountAccumulator; + + int32_t accumulatorFixedWidthSize() const override { + return static_cast(sizeof(AccumulatorType)); + } + + bool isFixedSize() const override { + return true; + } + + void addRawInput( + char** groups, + const SelectivityVector& rows, + const std::vector& args, + [[maybe_unused]] bool mayPushdown) override { + decodeInputData(rows, args); + + // Process the args data and update the accumulator for each group. + rows.applyToSelected([&](vector_size_t i) { + // If value is null, we do not want to update the accumulator. + if (decodedValue_.isNullAt(i) || decodedNoiseScale_.isNullAt(i)) { + return; + } + + auto group = groups[i]; + auto accumulator = exec::Aggregate::value(group); + accumulator->increaseCount(1); + + double noiseScale = decodedNoiseScale_.valueAt(i); + accumulator->checkAndSetNoiseScale(noiseScale); + }); + } + + void extractAccumulators(char** groups, int32_t numGroups, VectorPtr* result) + override { + auto flatResult = (*result)->asFlatVector(); + flatResult->resize(numGroups); + + auto numOfValidGroups = 0; + for (auto i = 0; i < numGroups; i++) { + numOfValidGroups += !isNull(groups[i]); + } + size_t totalSize = numOfValidGroups * AccumulatorType::serializedSize(); + + // Allocate buffer for serialized data. + auto rawBuffer = flatResult->getRawStringBufferWithSpace(totalSize); + size_t offset = 0; + auto size = AccumulatorType::serializedSize(); + + for (auto i = 0; i < numGroups; i++) { + auto group = groups[i]; + if (isNull(group)) { + flatResult->setNull(i, true); + continue; + } + + auto accumulator = exec::Aggregate::value(group); + + // Write to the pre-allocated buffer. + accumulator->serialize(rawBuffer + offset); + flatResult->setNoCopy( + i, StringView(rawBuffer + offset, static_cast(size))); + offset += size; + } + } + + void addIntermediateResults( + char** groups, + const SelectivityVector& rows, + const std::vector& args, + [[maybe_unused]] bool mayPushdown) override { + auto decodedVector = DecodedVector(*args[0], rows); + + rows.applyToSelected([&](vector_size_t i) { + if (decodedVector.isNullAt(i)) { + return; + } + + auto group = groups[i]; + auto accumulator = exec::Aggregate::value(group); + + auto serialized = decodedVector.valueAt(i); + auto otherAccumulator = AccumulatorType::deserialize(serialized.data()); + + accumulator->increaseCount(otherAccumulator.count); + + if (accumulator->noiseScale != otherAccumulator.noiseScale && + otherAccumulator.noiseScale >= 0) { + accumulator->checkAndSetNoiseScale(otherAccumulator.noiseScale); + } + }); + } + + void extractValues(char** groups, int32_t numGroups, VectorPtr* result) + override { + auto* flatResult = (*result)->as>(); + VELOX_CHECK(flatResult); + flatResult->resize(numGroups); + + // Find the noise scale from group. 
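+    // (-1 acts as the 'not set' sentinel: a group whose accumulator was never
+    // updated keeps noiseScale == -1, so we scan until a valid scale is found.)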
+    double noiseScale = -1;
+    for (auto i = 0; i < numGroups; i++) {
+      auto group = groups[i];
+      if (!isNull(group)) {
+        auto* accumulator = value<AccumulatorType>(group);
+        noiseScale = accumulator->noiseScale;
+        // If the aggregated value was null while the group-by key was not,
+        // the accumulator was never updated and that group's noise scale
+        // stays -1. Keep scanning until a valid noise scale is found.
+        if (noiseScale >= 0) {
+          break;
+        }
+      }
+    }
+
+    // If the noise scale was never set, either every group is null or the
+    // input is empty. To be consistent with Java, return null for all groups.
+    if (noiseScale < 0) {
+      for (auto i = 0; i < numGroups; ++i) {
+        flatResult->setNull(i, true);
+      }
+      return;
+    }
+
+    folly::Random::DefaultGenerator rng;
+
+    // Create a normal distribution with mean 0 and standard deviation
+    // 'noiseScale'.
+    std::normal_distribution distribution{0.0, 1.0};
+    if (noiseScale > 0) {
+      distribution = std::normal_distribution(0.0, noiseScale);
+    }
+
+    for (auto i = 0; i < numGroups; i++) {
+      auto group = groups[i];
+      if (isNull(group)) {
+        flatResult->set(i, 0); // Return 0 for a null group to match Java
+                               // behavior.
+      } else {
+        auto* accumulator = value<AccumulatorType>(group);
+        // If the group-by key is not null but the noise scale is invalid, all
+        // input values for this group were null, so we return null for this
+        // group instead of 0.
+        if (accumulator->noiseScale < 0) {
+          flatResult->setNull(i, true);
+          continue;
+        }
+        auto trueCount = static_cast<int64_t>(accumulator->count);
+
+        // Add noise to the count.
+        int64_t noise = 0;
+        if (noiseScale > 0) {
+          noise = static_cast<int64_t>(std::round(distribution(rng)));
+        }
+        int64_t noisyCount = trueCount + noise;
+
+        // Post-process the noisy count to make sure it is non-negative.
+        flatResult->set(i, std::max<int64_t>(noisyCount, 0));
+      }
+    }
+  }
+
+  void addSingleGroupIntermediateResults(
+      char* group,
+      const SelectivityVector& rows,
+      const std::vector<VectorPtr>& args,
+      [[maybe_unused]] bool mayPushdown) override {
+    DecodedVector decodedVector(*args[0], rows);
+    auto accumulator = exec::Aggregate::value<AccumulatorType>(group);
+
+    rows.applyToSelected([&](vector_size_t i) {
+      if (decodedVector.isNullAt(i)) {
+        return;
+      }
+
+      auto serialized = decodedVector.valueAt<StringView>(i);
+      auto otherAccumulator = AccumulatorType::deserialize(serialized.data());
+
+      accumulator->increaseCount(otherAccumulator.count);
+
+      if (accumulator->noiseScale != otherAccumulator.noiseScale &&
+          otherAccumulator.noiseScale >= 0) {
+        accumulator->checkAndSetNoiseScale(otherAccumulator.noiseScale);
+      }
+    });
+  }
+
+  void addSingleGroupRawInput(
+      char* group,
+      const SelectivityVector& rows,
+      const std::vector<VectorPtr>& args,
+      [[maybe_unused]] bool mayPushdown) override {
+    decodeInputData(rows, args);
+    auto accumulator = exec::Aggregate::value<AccumulatorType>(group);
+
+    rows.applyToSelected([&](vector_size_t i) {
+      if (decodedValue_.isNullAt(i) || decodedNoiseScale_.isNullAt(i)) {
+        return;
+      }
+
+      accumulator->increaseCount(1);
+      double noiseScale = decodedNoiseScale_.valueAt<double>(i);
+      accumulator->checkAndSetNoiseScale(noiseScale);
+    });
+  }
+
+ protected:
+  void initializeNewGroupsInternal(
+      char** groups,
+      folly::Range<const vector_size_t*> indices) override {
+    // Initialize the accumulator for each group.
+    for (auto i : indices) {
+      *value<AccumulatorType>(groups[i]) = AccumulatorType();
+    }
+  }
+
+  // Helper function to decode the input data.
+ void decodeInputData( + const SelectivityVector& rows, + const std::vector& args) { + decodedValue_.decode(*args[0], rows); + decodedNoiseScale_.decode(*args[1], rows); + } + + private: + DecodedVector decodedValue_; + DecodedVector decodedNoiseScale_; +}; +} // namespace + +void registerNoisyCountGaussianAggregate( + const std::string& prefix, + bool withCompanionFunctions, + bool overwrite) { + std::vector> signatures{ + exec::AggregateFunctionSignatureBuilder() + .typeVariable("T") + .returnType("bigint") + .intermediateType("varbinary") + .argumentType("T") + .argumentType("double") // support DOUBLE noise scale + .build(), + }; + + auto name = prefix + kNoisyCountGaussian; + + exec::registerAggregateFunction( + name, + signatures, + [name]( + core::AggregationNode::Step step, + const std::vector& argTypes, + [[maybe_unused]] const TypePtr& resultType, + [[maybe_unused]] const core::QueryConfig& config) + -> std::unique_ptr { + VELOX_USER_CHECK_EQ( + argTypes.size(), 2, "{} takes exactly 2 arguments", name); + + if (exec::isPartialOutput(step)) { + return std::make_unique(VARBINARY()); + } + + return std::make_unique(BIGINT()); + }, + {false /*orderSensitive*/, false /*companionFunction*/}, + withCompanionFunctions, + overwrite); +} + +} // namespace facebook::velox::aggregate::prestosql diff --git a/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.h b/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.h new file mode 100644 index 000000000000..9bfda08708df --- /dev/null +++ b/velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace facebook::velox::aggregate::prestosql { + +void registerNoisyCountGaussianAggregate( + const std::string& prefix, + bool withCompanionFunctions, + bool overwrite); + +} // namespace facebook::velox::aggregate::prestosql diff --git a/velox/functions/prestosql/aggregates/RegisterAggregateFunctions.cpp b/velox/functions/prestosql/aggregates/RegisterAggregateFunctions.cpp index 8e717675da13..3175904600d1 100644 --- a/velox/functions/prestosql/aggregates/RegisterAggregateFunctions.cpp +++ b/velox/functions/prestosql/aggregates/RegisterAggregateFunctions.cpp @@ -41,6 +41,7 @@ #include "velox/functions/prestosql/aggregates/MinByAggregate.h" #include "velox/functions/prestosql/aggregates/MinMaxAggregates.h" #include "velox/functions/prestosql/aggregates/MultiMapAggAggregate.h" +#include "velox/functions/prestosql/aggregates/NoisyCountGaussianAggregate.h" #include "velox/functions/prestosql/aggregates/NoisyCountIfGaussianAggregate.h" #include "velox/functions/prestosql/aggregates/QDigestAggAggregate.h" #include "velox/functions/prestosql/aggregates/ReduceAgg.h" @@ -94,6 +95,8 @@ void registerAllAggregateFunctions( registerMinByAggregates(prefix, withCompanionFunctions, overwrite); registerNoisyCountIfGaussianAggregate( prefix, withCompanionFunctions, overwrite); + registerNoisyCountGaussianAggregate( + prefix, withCompanionFunctions, overwrite); registerReduceAgg(prefix, withCompanionFunctions, overwrite); registerSetAggAggregate(prefix, withCompanionFunctions, overwrite); registerSetUnionAggregate(prefix, withCompanionFunctions, overwrite); diff --git a/velox/functions/prestosql/aggregates/tests/CMakeLists.txt b/velox/functions/prestosql/aggregates/tests/CMakeLists.txt index a9a76a8f2718..1e01ef4c7df9 100644 --- a/velox/functions/prestosql/aggregates/tests/CMakeLists.txt +++ b/velox/functions/prestosql/aggregates/tests/CMakeLists.txt @@ -42,6 +42,7 @@ add_executable( MinMaxByAggregationTest.cpp MinMaxTest.cpp MultiMapAggTest.cpp + NoisyCountGaussianAggregationTest.cpp NoisyCountIfGaussianAggregationTest.cpp PrestoHasherTest.cpp QDigestAggTest.cpp diff --git a/velox/functions/prestosql/aggregates/tests/NoisyCountGaussianAggregationTest.cpp b/velox/functions/prestosql/aggregates/tests/NoisyCountGaussianAggregationTest.cpp new file mode 100644 index 000000000000..a0c0d0487776 --- /dev/null +++ b/velox/functions/prestosql/aggregates/tests/NoisyCountGaussianAggregationTest.cpp @@ -0,0 +1,243 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include "velox/functions/lib/aggregates/tests/utils/AggregationTestBase.h"
+
+using namespace facebook::velox::exec::test;
+
+namespace facebook::velox::aggregate::test {
+class NoisyCountGaussianAggregationTest
+    : public functions::aggregate::test::AggregationTestBase {
+ protected:
+  void SetUp() override {
+    AggregationTestBase::SetUp();
+  }
+  RowTypePtr rowType1_{
+      ROW({"c0", "c1", "c2"}, {INTEGER(), BOOLEAN(), VARCHAR()})};
+  RowTypePtr rowType2_{
+      ROW({"c0", "c1", "c2"}, {DOUBLE(), TIMESTAMP(), BIGINT()})};
+};
+
+// Test normal count(*).
+TEST_F(NoisyCountGaussianAggregationTest, countStarNoNoise) {
+  auto vectors = makeVectors(rowType1_, 10, 3);
+  createDuckDbTable(vectors);
+
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(1, 0.0)"}, // Normal count(*) should be rewritten
+                                        // to call noisy_count_gaussian(1, 0.0).
+      "SELECT count(*) FROM tmp");
+}
+
+// Test cases that take different input types without noise.
+TEST_F(NoisyCountGaussianAggregationTest, constNoNoise) {
+  auto vectors = {makeRowVector({
+      makeFlatVector(10, [](vector_size_t row) { return row / 3; }),
+      makeConstant(true, 10),
+      makeConstant("foo", 10),
+  })};
+  createDuckDbTable(vectors);
+
+  // Aggregate on column c2 which contains string values.
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c2, 0.0)"},
+      "SELECT count(c2) FROM tmp");
+
+  // Aggregate on column c1 which contains boolean values.
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c1, 0.0)"},
+      "SELECT count(c1) FROM tmp");
+
+  // Aggregate on column c0 which contains integer values.
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c0, 0.0)"},
+      "SELECT count(c0) FROM tmp");
+}
+
+// Test cases where the noise scale is invalid.
+TEST_F(NoisyCountGaussianAggregationTest, inValidNoise) {
+  auto vectors = makeVectors(rowType1_, 10, 5);
+  createDuckDbTable(vectors);
+
+  // The test should fail and output the expected error message.
+  testFailingAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c2, -1.0)"},
+      "Noise scale must be a non-negative value");
+}
+
+// Test cases where the input vector has nulls and non-nulls.
+TEST_F(NoisyCountGaussianAggregationTest, aggregateMixedNoNoise) {
+  // Make a single row vector with nulls, null every 4th row (0, 4, 8, ...).
+  auto vectors = {
+      makeRowVector({makeFlatVector(
+          10, [](auto row) { return row % 3 == 0; }, nullEvery(4))}),
+  };
+
+  createDuckDbTable(vectors);
+
+  // Test against the DuckDB result.
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c0, 0.0)"},
+      "SELECT count(c0) FROM tmp");
+
+  // Test against the expected result; 3 out of 10 rows are null.
+  auto expectedResult =
+      makeRowVector({makeConstant(static_cast<int64_t>(7 /*valid rows*/), 1)});
+
+  testAggregations(
+      vectors, {}, {"noisy_count_gaussian(c0, 0.0)"}, {expectedResult});
+}
+
+// Test cases where the input vector has all null values.
+TEST_F(NoisyCountGaussianAggregationTest, aggregateAllNullsNoNoise) {
+  auto vectors = {makeRowVector({makeAllNullFlatVector(10)})};
+
+  // DuckDB will output 0 in this case while Presto Java noisy functions
+  // will output NULL.
+  // Test against the expected result, which is NULL.
+  auto expectedResult = makeRowVector({makeAllNullFlatVector<int64_t>(1)});
+
+  testAggregations(
+      vectors, {}, {"noisy_count_gaussian(c0, 0.0)"}, {expectedResult});
+}
+
+// Test cases where the input vector has nulls with multiple groups, no noise.
+TEST_F(NoisyCountGaussianAggregationTest, groupByNullNoNoise) {
+  // Make a batch of vectors in which the first column contains all nulls,
+  // the second contains integers, and the third contains strings.
+  auto vectors = {
+      makeRowVector(
+          {makeAllNullFlatVector(10),
+           makeFlatVector(10, [](auto row) { return row % 3; }),
+           makeConstant("foo", 10)}),
+  };
+
+  createDuckDbTable(vectors);
+
+  // Test against the DuckDB result.
+  testAggregations(
+      vectors,
+      {"c0", "c1"},
+      {"noisy_count_gaussian(c2, 0.0)"},
+      "SELECT c0, c1, count(c2) FROM tmp GROUP BY c0, c1");
+
+  // Test against the expected result. Expected result is:
+  //   c0    c1   noisy_count_gaussian(c2, 0.0)
+  //   NULL  0    4
+  //   NULL  1    3
+  //   NULL  2    3
+  auto expectedResult = makeRowVector({
+      makeAllNullFlatVector(3), // group by c0 is null
+      makeFlatVector(3, [](auto row) { return row % 3; }), // group by c1
+      makeFlatVector<int64_t>(
+          3, [](auto row) { return row == 0 ? 4 : 3; }), // noisy count.
+  });
+
+  testAggregations(
+      vectors,
+      {"c0", "c1"},
+      {"noisy_count_gaussian(c2, 0.0)"},
+      {expectedResult});
+}
+
+TEST_F(NoisyCountGaussianAggregationTest, oneAggregateSingleGroupNoNoise) {
+  // Make two batches of rows: one with nulls; another without.
+  auto vectors = makeVectors(rowType1_, 10, 2);
+
+  createDuckDbTable(vectors);
+
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c0, 0.0)"},
+      "SELECT count(c0) FROM tmp");
+}
+
+TEST_F(NoisyCountGaussianAggregationTest, oneAggregateMultipleGroupsNoNoise) {
+  auto vectors = {
+      // This test case is designed to test the scenario where the aggregated
+      // column has null values for one of the group-by keys.
+      // c0: 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+      // c1: null, 1, null, 1, null, 1, null, 1, null, 1
+      // c2: "0foo", "1foo", ... "8foo", "9foo"
+      makeRowVector({
+          makeFlatVector(10, [](auto row) { return row % 2; }), // c0
+          makeFlatVector(
+              10, [](auto row) { return row % 2; }, nullEvery(2)), // c1
+          makeFlatVector(
+              10, [](auto row) { return std::to_string(row) + "foo"; }) // c2
+      }),
+  };
+
+  // EXPECTED RESULT:
+  //   c0 | noisy_count_gaussian(c1, 0.0)
+  //   0  | NULL
+  //   1  | 5
+  auto expectedResult = makeRowVector(
+      {makeFlatVector({0, 1}),
+       makeNullableFlatVector<int64_t>({std::nullopt, 5})});
+
+  testAggregations(
+      vectors, {"c0"}, {"noisy_count_gaussian(c1, 0.0)"}, {expectedResult});
+}
+
+TEST_F(NoisyCountGaussianAggregationTest, twoAggregatesSingleGroupNoNoise) {
+  auto vectors = makeVectors(rowType1_, 10, 4);
+  createDuckDbTable(vectors);
+
+  testAggregations(
+      vectors,
+      {},
+      {"noisy_count_gaussian(c1, 0.0)", "noisy_count_gaussian(c2, 0.0)"},
+      "SELECT count(c1), count(c2) FROM tmp");
+
+  // Test with non-zero noise_scale.
+  auto vectors2 = {makeRowVector({makeConstant(1, 100)})}; // 100 rows of 1.
+
+  // Theoretically the outcome of noisy_count_gaussian(c0, noise_scale) is
+  // unpredictable, but about 95% of the values fall within +/- 2 *
+  // noise_scale and 99.7% within +/- 3 * noise_scale. We set the noise_scale
+  // to 1.0. To avoid any failure by chance, we only require the noisy count
+  // to be within +/- 50 of the actual 100, i.e. check that the noisy count
+  // is within [50, 150].
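+  // (A deviation of 50 at noise_scale = 1.0 would be 50 standard deviations
+  // from the mean, so this bound is effectively impossible to violate.)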
+
+  auto result =
+      AssertQueryBuilder(
+          PlanBuilder()
+              .values(vectors2)
+              .singleAggregation({}, {"noisy_count_gaussian(c0, 1.0)"}, {})
+              .planNode(),
+          duckDbQueryRunner_)
+          .copyResults(pool());
+  ASSERT_TRUE(result->size() == 1);
+  ASSERT_TRUE(result->childAt(0)->asFlatVector<int64_t>()->valueAt(0) >= 50);
+  ASSERT_TRUE(result->childAt(0)->asFlatVector<int64_t>()->valueAt(0) <= 150);
+}
+
+} // namespace facebook::velox::aggregate::test
diff --git a/velox/functions/prestosql/coverage/README.md b/velox/functions/prestosql/coverage/README.md index 769af4e805d0..76459165c5a6 100644 --- a/velox/functions/prestosql/coverage/README.md +++ b/velox/functions/prestosql/coverage/README.md
@@ -15,7 +15,7 @@ to be copy-pasted into velox/docs/functions.rst file.
Generates coverage map using all Presto functions. The output to be copy-pasted into velox/docs/functions/presto/coverage.rst file. The functions appear in alphabetical order.
-Before generating the coverage map for all Presto functions, please ensure that the data
+Before generating the coverage map for all Presto functions, please ensure that the data
files at velox/functions/prestosql/coverage/data/ contain all the Presto functions.
To generate a list of all Presto functions, please run ```SHOW FUNCTIONS``` in Presto.
diff --git a/velox/functions/prestosql/geospatial/GeometryUtils.h b/velox/functions/prestosql/geospatial/GeometryUtils.h index a1538a7f0950..8ec896838f0c 100644 --- a/velox/functions/prestosql/geospatial/GeometryUtils.h +++ b/velox/functions/prestosql/geospatial/GeometryUtils.h
@@ -42,6 +42,20 @@
    return Status::UserError( \
        fmt::format("{}: {}", user_error_message, e.what())); \
  }
+
+/// Utility macro used to wrap GEOS library calls in a try-catch block,
+/// rethrowing any caught GEOS exception as a Velox user or runtime error
+/// carrying the given message.
+#define GEOS_RETHROW(func, user_error_message) \ + try { \ + func \ + } catch (const geos::util::UnsupportedOperationException& e) { \ + VELOX_USER_FAIL(fmt::format("Internal geometry error: {}", e.what())); \ + } catch (const geos::util::AssertionFailedException& e) { \ + VELOX_FAIL(fmt::format("Internal geometry error: {}", e.what())); \ + } catch (const geos::util::GEOSException& e) { \ + VELOX_FAIL(fmt::format("{}: {}", user_error_message, e.what())); \ + } + FOLLY_ALWAYS_INLINE const std::unordered_map& getGeosTypeToStringIdentifier() { diff --git a/velox/functions/prestosql/json/SIMDJsonExtractor.cpp b/velox/functions/prestosql/json/SIMDJsonExtractor.cpp index 0766b1dd0f2a..43ee4c62b0bf 100644 --- a/velox/functions/prestosql/json/SIMDJsonExtractor.cpp +++ b/velox/functions/prestosql/json/SIMDJsonExtractor.cpp @@ -77,7 +77,8 @@ simdjson::error_code extractObject( for (auto field : jsonObj) { SIMDJSON_ASSIGN_OR_RAISE(auto currentKey, field.unescaped_key()); if (currentKey == key) { - ret.emplace(field.value()); + SIMDJSON_ASSIGN_OR_RAISE(auto value, field.value()); + ret.emplace(value); return simdjson::SUCCESS; } } diff --git a/velox/functions/prestosql/json/SIMDJsonExtractor.h b/velox/functions/prestosql/json/SIMDJsonExtractor.h index 42a667ad15fe..c770d357f38e 100644 --- a/velox/functions/prestosql/json/SIMDJsonExtractor.h +++ b/velox/functions/prestosql/json/SIMDJsonExtractor.h @@ -167,7 +167,7 @@ simdjson::error_code SIMDJsonExtractor::extractInternal( if (selector == JsonPathTokenizer::Selector::WILDCARD) { SIMDJSON_ASSIGN_OR_RAISE(auto jsonObj, input.get_object()); for (auto field : jsonObj) { - simdjson::ondemand::value val = field.value(); + SIMDJSON_ASSIGN_OR_RAISE(auto val, field.value()); if (tokenIndex == tokens_.size() - 1) { // Consume each element in the object. SIMDJSON_TRY(consumer(val)); @@ -189,7 +189,8 @@ simdjson::error_code SIMDJsonExtractor::extractInternal( } } else if (input.type() == simdjson::ondemand::json_type::array) { if (selector == JsonPathTokenizer::Selector::WILDCARD) { - for (auto child : input.get_array()) { + SIMDJSON_ASSIGN_OR_RAISE(auto array, input.get_array()); + for (auto child : array) { if (tokenIndex == tokens_.size() - 1) { // Consume each element in the object. SIMDJSON_TRY(consumer(child.value())); @@ -235,7 +236,7 @@ simdjson::error_code SIMDJsonExtractor::visitRecursive( simdjson::padded_string_view paddedJson = reusePaddedStringView(jsonString); simdjson::ondemand::parser localParser; SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, localParser.iterate(paddedJson)); - simdjson::ondemand::value jsonDocVal = jsonDoc.get_value(); + SIMDJSON_ASSIGN_OR_RAISE(auto jsonDocVal, jsonDoc.get_value()); // Visit the current node. SIMDJSON_TRY( extractInternal(jsonDocVal, consumer, isDefinitePath, startTokenIdx)); @@ -243,11 +244,11 @@ simdjson::error_code SIMDJsonExtractor::visitRecursive( // Reset the local parser for the next round of iteration where we visit the // children. 
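// (simdjson's ondemand documents are forward-only, so the document must be
// re-iterated from scratch before the children can be visited.)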
SIMDJSON_ASSIGN_OR_RAISE(jsonDoc, localParser.iterate(paddedJson)); - jsonDocVal = jsonDoc.get_value(); + SIMDJSON_ASSIGN_OR_RAISE(jsonDocVal, jsonDoc.get_value()); if (jsonDocVal.type() == simdjson::ondemand::json_type::object) { SIMDJSON_ASSIGN_OR_RAISE(auto jsonObj, jsonDocVal.get_object()); for (auto field : jsonObj) { - simdjson::ondemand::value val = field.value(); + SIMDJSON_ASSIGN_OR_RAISE(auto val, field.value()); if (val.type() != simdjson::ondemand::json_type::object && val.type() != simdjson::ondemand::json_type::array) { continue; @@ -256,7 +257,8 @@ simdjson::error_code SIMDJsonExtractor::visitRecursive( visitRecursive(val, consumer, isDefinitePath, startTokenIdx)); } } else if (jsonDocVal.type() == simdjson::ondemand::json_type::array) { - for (auto child : jsonDocVal.get_array()) { + SIMDJSON_ASSIGN_OR_RAISE(auto array, jsonDocVal.get_array()); + for (auto child : array) { simdjson::ondemand::value val = child.value(); if (val.type() != simdjson::ondemand::json_type::object && val.type() != simdjson::ondemand::json_type::array) { diff --git a/velox/functions/prestosql/registration/GeometryFunctionsRegistration.cpp b/velox/functions/prestosql/registration/GeometryFunctionsRegistration.cpp index fd81a92ca00d..338f1fd14ce4 100644 --- a/velox/functions/prestosql/registration/GeometryFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/GeometryFunctionsRegistration.cpp @@ -90,6 +90,10 @@ void registerAccessors(const std::string& prefix) { registerFunction({{prefix + "ST_YMin"}}); registerFunction({{prefix + "ST_XMax"}}); registerFunction({{prefix + "ST_YMax"}}); + registerFunction( + {{prefix + "ST_GeometryType"}}); + registerFunction( + {{prefix + "ST_Distance"}}); } } // namespace diff --git a/velox/functions/prestosql/tests/GeometryFunctionsTest.cpp b/velox/functions/prestosql/tests/GeometryFunctionsTest.cpp index 26d529bcbd7f..da8120168e12 100644 --- a/velox/functions/prestosql/tests/GeometryFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/GeometryFunctionsTest.cpp @@ -1502,3 +1502,139 @@ TEST_F(GeometryFunctionsTest, testSTMax) { assertPointMax(std::nullopt, std::nullopt, std::nullopt); assertPointMax("POLYGON EMPTY", std::nullopt, std::nullopt); } + +TEST_F(GeometryFunctionsTest, testStGeometryType) { + const auto testStGeometryTypeFunc = + [&](const std::optional& wkt, + const std::optional& expected) { + std::optional result = evaluateOnce( + "ST_GeometryType(ST_GeometryFromText(c0))", wkt); + + if (wkt.has_value()) { + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(expected.has_value()); + ASSERT_EQ(result.value(), expected.value()); + } else { + ASSERT_FALSE(result.has_value()); + } + }; + + testStGeometryTypeFunc("POINT EMPTY", "Point"); + testStGeometryTypeFunc("POINT (3 5)", "Point"); + testStGeometryTypeFunc("LINESTRING EMPTY", "LineString"); + testStGeometryTypeFunc("LINESTRING (1 1, 2 2, 3 3)", "LineString"); + testStGeometryTypeFunc("LINEARRING EMPTY", "LineString"); + testStGeometryTypeFunc("POLYGON EMPTY", "Polygon"); + testStGeometryTypeFunc("POLYGON ((1 1, 4 1, 1 4, 1 1))", "Polygon"); + testStGeometryTypeFunc("MULTIPOINT EMPTY", "MultiPoint"); + testStGeometryTypeFunc("MULTIPOINT (1 2, 2 4, 3 6, 4 8)", "MultiPoint"); + testStGeometryTypeFunc("MULTILINESTRING EMPTY", "MultiLineString"); + testStGeometryTypeFunc( + "MULTILINESTRING ((1 1, 5 1), (2 4, 4 4))", "MultiLineString"); + testStGeometryTypeFunc("MULTIPOLYGON EMPTY", "MultiPolygon"); + testStGeometryTypeFunc( + "MULTIPOLYGON (((1 1, 1 3, 3 3, 3 1, 1 1)), ((2 4, 2 6, 6 6, 6 4, 2 
4)))", + "MultiPolygon"); + testStGeometryTypeFunc("GEOMETRYCOLLECTION EMPTY", "GeometryCollection"); + testStGeometryTypeFunc( + "GEOMETRYCOLLECTION (POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0)), POLYGON ((1 1, 3 1, 3 3, 1 3, 1 1)), GEOMETRYCOLLECTION (POINT (8 8), LINESTRING (5 5, 6 6), POLYGON ((1 1, 3 1, 3 4, 1 4, 1 1))))", + "GeometryCollection"); +} + +TEST_F(GeometryFunctionsTest, testStDistance) { + const auto testStDistanceFunc = [&](const std::optional& wkt1, + const std::optional& wkt2, + const std::optional& expected = + std::nullopt) { + std::optional result = evaluateOnce( + "ST_Distance(ST_GeometryFromText(c0), ST_GeometryFromText(c1))", + wkt1, + wkt2); + + if (wkt1.has_value() && wkt2.has_value()) { + if (expected.has_value()) { + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected.value()); + } else { + ASSERT_FALSE(result.has_value()); + } + } else { + ASSERT_FALSE(expected.has_value()); + ASSERT_FALSE(result.has_value()); + } + }; + + testStDistanceFunc("POINT (50 100)", "POINT (150 150)", 111.80339887498948); + testStDistanceFunc("MULTIPOINT (50 100, 50 200)", "POINT (50 100)", 0.0); + testStDistanceFunc( + "LINESTRING (50 100, 50 200)", + "LINESTRING (10 10, 20 20)", + 85.44003745317531); + testStDistanceFunc( + "MULTILINESTRING ((1 1, 5 1), (2 4, 4 4))", + "LINESTRING (10 20, 20 50)'))", + 17.08800749063506); + testStDistanceFunc( + "POLYGON ((1 1, 1 3, 3 3, 3 1, 1 1))", + "POLYGON ((4 4, 4 5, 5 5, 5 4, 4 4))", + 1.4142135623730951); + testStDistanceFunc( + "MULTIPOLYGON (((1 1, 1 3, 3 3, 3 1, 1 1)), ((0 0, 0 2, 2 2, 2 0, 0 0)))", + "POLYGON ((10 100, 30 10, 30 100, 10 100))", + 27.892651361962706); + + testStDistanceFunc("POINT EMPTY", "POINT (150 150)"); + testStDistanceFunc("MULTIPOINT EMPTY", "POINT (50 100)"); + testStDistanceFunc("LINESTRING EMPTY", "LINESTRING (10 10, 20 20)"); + testStDistanceFunc("MULTILINESTRING EMPTY", "LINESTRING (10 20, 20 50)'))"); + testStDistanceFunc("POLYGON EMPTY", "POLYGON ((4 4, 4 5, 5 5, 5 4, 4 4))"); + testStDistanceFunc( + "MULTIPOLYGON EMPTY", "POLYGON ((10 100, 30 10, 30 100, 10 100))"); + testStDistanceFunc(std::nullopt, "POINT (50 100)"); +} + +TEST_F(GeometryFunctionsTest, testStXY) { + const auto testStX = [&](const std::optional& wkt, + const std::optional& expectedX = + std::nullopt) { + std::optional resultX = + evaluateOnce("ST_X(ST_GeometryFromText(c0))", wkt); + + if (expectedX.has_value()) { + ASSERT_TRUE(resultX.has_value()); + ASSERT_EQ(expectedX.value(), resultX.value()); + } else { + ASSERT_FALSE(resultX.has_value()); + } + }; + const auto testStY = [&](const std::optional& wkt, + const std::optional& expectedY = + std::nullopt) { + std::optional resultY = + evaluateOnce("ST_Y(ST_GeometryFromText(c0))", wkt); + + if (expectedY.has_value()) { + ASSERT_TRUE(resultY.has_value()); + ASSERT_EQ(expectedY.value(), resultY.value()); + } else { + ASSERT_FALSE(resultY.has_value()); + } + }; + + testStX("POINT (1 2)", 1.0); + testStY("POINT (1 2)", 2.0); + testStX("POINT EMPTY", std::nullopt); + testStY("POINT EMPTY", std::nullopt); + VELOX_ASSERT_USER_THROW( + testStX("GEOMETRYCOLLECTION EMPTY"), + "ST_X requires a Point geometry, found GeometryCollection"); + VELOX_ASSERT_USER_THROW( + testStY("GEOMETRYCOLLECTION EMPTY"), + "ST_Y requires a Point geometry, found GeometryCollection"); + VELOX_ASSERT_USER_THROW( + testStX("POLYGON ((1 1, 1 3, 3 3, 3 1, 1 1))"), + "ST_X requires a Point geometry, found Polygon"); + VELOX_ASSERT_USER_THROW( + testStY("POLYGON ((1 1, 1 3, 3 3, 3 1, 1 1))"), + "ST_Y requires a Point 
geometry, found Polygon"); +} diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index bd018163143e..23528708baf7 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -1018,6 +1018,13 @@ TEST_F(JsonFunctionsTest, jsonExtract) { EXPECT_EQ(std::nullopt, jsonExtract("INVALID_JSON", "$")); VELOX_ASSERT_THROW(jsonExtract("{\"\":\"\"}", ""), "Invalid JSON path"); + // This is a special case where the input is only identified as invalid once + // jsonExtract starts to traverse the path. + EXPECT_EQ( + std::nullopt, + jsonExtract( + R"({3436654998315577471:-768009352,3684989847712002091:-317930923,5235625120989803984:1278962211,6359026774420146638:651644866,6614027999037539496:528067092})", + "$.*")); EXPECT_EQ( "[\"0-553-21311-3\",\"0-395-19395-8\"]", diff --git a/velox/functions/prestosql/tests/ProbabilityTest.cpp b/velox/functions/prestosql/tests/ProbabilityTest.cpp index 2b9ef6ce23b1..99684a878385 100644 --- a/velox/functions/prestosql/tests/ProbabilityTest.cpp +++ b/velox/functions/prestosql/tests/ProbabilityTest.cpp @@ -102,6 +102,7 @@ class ProbabilityTest : public functions::test::FunctionBaseTest { 0.0); EXPECT_EQ(binomialCDF(10, 0.1, -2), 0.0); EXPECT_EQ(binomialCDF(25, 0.5, -100), 0.0); + EXPECT_EQ(binomialCDF(2, 0.1, 3), 1.0); // Invalid inputs VELOX_ASSERT_THROW( diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index f94dade0e618..b331230f67c2 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -17,6 +17,7 @@ velox_add_library( velox_functions_spark_impl ArrayGetFunction.cpp ArraySort.cpp + CharVarcharUtils.cpp Comparisons.cpp ConcatWs.cpp DecimalArithmetic.cpp diff --git a/velox/functions/sparksql/CharVarcharUtils.cpp b/velox/functions/sparksql/CharVarcharUtils.cpp new file mode 100644 index 000000000000..a91e819676e8 --- /dev/null +++ b/velox/functions/sparksql/CharVarcharUtils.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/functions/sparksql/CharVarcharUtils.h" + +namespace facebook::velox::functions::sparksql { + +void trimTrailingSpaces( + exec::StringWriter& output, + StringView inputStr, + int32_t numChars, + uint32_t limit) { + const auto numTailSpacesToTrim = numChars - limit; + VELOX_USER_CHECK_GT(numTailSpacesToTrim, 0); + + auto curPos = inputStr.end() - 1; + const auto trimTo = inputStr.end() - numTailSpacesToTrim; + + while (curPos >= trimTo && stringImpl::isAsciiSpace(*curPos)) { + curPos--; + } + // Get the length of the trimmed string in characters. 
+  const auto trimmedSize = numChars - std::distance(curPos + 1, inputStr.end()); + + VELOX_USER_CHECK_LE( + trimmedSize, limit, "Exceeds allowed length limitation: {}", limit); + output.setNoCopy( + StringView(inputStr.data(), std::distance(inputStr.begin(), curPos + 1))); +} + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/CharVarcharUtils.h b/velox/functions/sparksql/CharVarcharUtils.h new file mode 100644 index 000000000000..10055736a182 --- /dev/null +++ b/velox/functions/sparksql/CharVarcharUtils.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include <cstdint> + +#include "velox/expression/StringWriter.h" +#include "velox/functions/lib/string/StringImpl.h" + +namespace facebook::velox::functions::sparksql { + +/// Trims trailing ASCII space characters (0x20) from 'inputStr' so that its +/// length in characters does not exceed 'limit' (which must be greater than +/// 0). Throws a user error if the string still exceeds 'limit' after +/// trimming. +void trimTrailingSpaces( + exec::StringWriter& output, + StringView inputStr, + int32_t numChars, + uint32_t limit); + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 2da694fb9e0a..2049dd64c627 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -580,6 +580,51 @@ struct DateTruncFunction { const tz::TimeZone* timeZone_ = nullptr; }; +/// Truncates a date to the specified time unit. Returns NULL if the format is +/// invalid. Abbreviated unit strings are allowed as the format. +template <typename T> +struct TruncFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void initialize( + const std::vector<TypePtr>& /*inputTypes*/, + const core::QueryConfig& config, + const arg_type<Date>* /*date*/, + const arg_type<Varchar>* format) { + if (format != nullptr) { + unit_ = fromDateTimeUnitString( + *format, + /*throwIfInvalid=*/false, + /*allowMicro=*/false, + /*allowAbbreviated=*/true); + } + } + + FOLLY_ALWAYS_INLINE bool call( + out_type<Date>& result, + const arg_type<Date>& date, + const arg_type<Varchar>& format) { + const auto unitOption = unit_.has_value() ? unit_ : fromDateTimeUnitString( + format, + /*throwIfInvalid=*/false, + /*allowMicro=*/false, + /*allowAbbreviated=*/true); + // Return NULL if the unit is invalid or finer than week.
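+    // DateTimeUnit enumerators are assumed ordered from finer to coarser
+    // granularity, so units finer than kWeek (day, hour, minute, ...)
+    // compare less than kWeek.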
+    if (!unitOption.has_value() || unitOption.value() < DateTimeUnit::kWeek) { + return false; + } + auto dateTime = getDateTime(date); + adjustDateTime(dateTime, unitOption.value()); + + result = Timestamp::calendarUtcToEpoch(dateTime) / kSecondsInDay; + return true; + } + + private: + std::optional<DateTimeUnit> unit_; +}; + template <typename T> struct DateAddFunction { VELOX_DEFINE_FUNCTION_TYPES(T); diff --git a/velox/functions/sparksql/VarcharTypeWriteSideCheck.h b/velox/functions/sparksql/VarcharTypeWriteSideCheck.h new file mode 100644 index 000000000000..70eadd55a8ad --- /dev/null +++ b/velox/functions/sparksql/VarcharTypeWriteSideCheck.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include <cstdint> + +#include "velox/functions/Macros.h" +#include "velox/functions/lib/string/StringImpl.h" +#include "velox/functions/sparksql/CharVarcharUtils.h" + +namespace facebook::velox::functions::sparksql { + +/// Ensures that the input string fits within the specified length 'limit' in +/// characters. If the length of the input exceeds 'limit', trailing spaces +/// are trimmed so that it fits within 'limit'. If the length of the input is +/// less than or equal to 'limit', it is returned as-is. Throws an exception +/// if the trimmed string still exceeds 'limit' or if 'limit' is not positive. +/// This function trims at most (length of input - 'limit') space characters +/// (ASCII 32) from the end of the input. +template <typename T> +struct VarcharTypeWriteSideCheckFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + // Results refer to strings in the first argument. + static constexpr int32_t reuse_strings_from_arg = 0; + + // ASCII input always produces ASCII result.
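+  // For illustration (mirrors VarcharTypeWriteSideCheckTest below): with
+  // limit 3, "abc " trims to "abc", while "abcd" throws
+  // "Exceeds allowed length limitation: 3".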
+  static constexpr bool is_default_ascii_behavior = true; + + FOLLY_ALWAYS_INLINE void call( + out_type<Varchar>& result, + const arg_type<Varchar>& input, + int32_t limit) { + doCall(result, input, limit); + } + + FOLLY_ALWAYS_INLINE void callAscii( + out_type<Varchar>& result, + const arg_type<Varchar>& input, + int32_t limit) { + doCall(result, input, limit); + } + + private: + FOLLY_ALWAYS_INLINE void doCall( + out_type<Varchar>& result, + const arg_type<Varchar>& input, + int32_t limit) { + VELOX_USER_CHECK_GT(limit, 0, "The length limit must be greater than 0."); + + auto numCharacters = stringImpl::length(input); + if (numCharacters <= limit) { + result.setNoCopy(input); + } else { + trimTrailingSpaces(result, input, numCharacters, limit); + } + } +}; + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/registration/RegisterDatetime.cpp b/velox/functions/sparksql/registration/RegisterDatetime.cpp index 0625ae202544..52b32032b12f 100644 --- a/velox/functions/sparksql/registration/RegisterDatetime.cpp +++ b/velox/functions/sparksql/registration/RegisterDatetime.cpp @@ -94,6 +94,7 @@ void registerDatetimeFunctions(const std::string& prefix) { {prefix + "timestamp_millis"}); registerFunction<DateTruncFunction, Timestamp, Varchar, Timestamp>( {prefix + "date_trunc"}); + registerFunction<TruncFunction, Date, Date, Varchar>({prefix + "trunc"}); } } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/registration/RegisterString.cpp b/velox/functions/sparksql/registration/RegisterString.cpp index 1ddc231c3375..d3ad800bc912 100644 --- a/velox/functions/sparksql/registration/RegisterString.cpp +++ b/velox/functions/sparksql/registration/RegisterString.cpp @@ -15,6 +15,7 @@ */ #include "velox/expression/SpecialFormRegistry.h" #include "velox/functions/lib/Re2Functions.h" +#include "velox/functions/lib/UpperLower.h" #include "velox/functions/prestosql/StringFunctions.h" #include "velox/functions/prestosql/URLFunctions.h" #include "velox/functions/sparksql/ConcatWs.h" @@ -23,12 +24,11 @@ #include "velox/functions/sparksql/Split.h" #include "velox/functions/sparksql/String.h" #include "velox/functions/sparksql/StringToMap.h" +#include "velox/functions/sparksql/VarcharTypeWriteSideCheck.h" namespace facebook::velox::functions { void registerSparkStringFunctions(const std::string& prefix) { VELOX_REGISTER_VECTOR_FUNCTION(udf_concat, prefix + "concat"); - VELOX_REGISTER_VECTOR_FUNCTION(udf_lower, prefix + "lower"); - VELOX_REGISTER_VECTOR_FUNCTION(udf_upper, prefix + "upper"); VELOX_REGISTER_VECTOR_FUNCTION(udf_reverse, prefix + "reverse"); } @@ -149,6 +149,24 @@ void registerStringFunctions(const std::string& prefix) { ConcatWsCallToSpecialForm::kConcatWs, std::make_unique<ConcatWsCallToSpecialForm>()); registerFunction({prefix + "luhn_check"}); + + using SparkUpperFunction = + UpperLowerTemplateFunction</*isLower=*/false>; + using SparkLowerFunction = + UpperLowerTemplateFunction</*isLower=*/true>; + exec::registerVectorFunction( + prefix + "upper", + SparkUpperFunction::signatures(), + std::make_unique<SparkUpperFunction>()); + exec::registerVectorFunction( + prefix + "lower", + SparkLowerFunction::signatures(), + std::make_unique<SparkLowerFunction>()); + registerFunction< + VarcharTypeWriteSideCheckFunction, + Varchar, + Varchar, + int32_t>({prefix + "varchar_type_write_side_check"}); } } // namespace sparksql } // namespace facebook::velox::functions diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 26dd66af053f..6cc3ef24004f 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -66,7 +66,9 @@ add_executable( StringTest.cpp
StringToMapTest.cpp UnscaledValueFunctionTest.cpp + UpperLowerTest.cpp UuidTest.cpp + VarcharTypeWriteSideCheckTest.cpp XxHash64Test.cpp) add_test(velox_functions_spark_test velox_functions_spark_test) diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index 80584844bb10..6a097e10700f 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -1261,5 +1261,31 @@ TEST_F(DateTimeFunctionsTest, dateTrunc) { Timestamp(978336000, 0), dateTrunc("year", Timestamp(998'474'645, 321'001'234))); } + +TEST_F(DateTimeFunctionsTest, trunc) { + const auto trunc = [&](std::optional<int32_t> date, + const std::string& format) { + return evaluateOnce<int32_t>( + fmt::format("trunc(c0, '{}')", format), DATE(), date); + }; + + // Date(0) is 1970-01-01. + EXPECT_EQ(std::nullopt, trunc(0, "")); + EXPECT_EQ(std::nullopt, trunc(0, "day")); + EXPECT_EQ(std::nullopt, trunc(0, "hour")); + EXPECT_EQ(std::nullopt, trunc(0, "minute")); + EXPECT_EQ(std::nullopt, trunc(0, "second")); + EXPECT_EQ(std::nullopt, trunc(0, "millisecond")); + EXPECT_EQ(std::nullopt, trunc(0, "microsecond")); + + // Date(19576) is 2023-08-07, a Monday, so trunc to week returns it as-is. + EXPECT_EQ(19576, trunc(19576, "week")); + // Date(19579) is 2023-08-10, a Thursday, so trunc to week returns Monday. + EXPECT_EQ(19576, trunc(19579, "week")); + // Date(18297) is 2020-02-05. + EXPECT_EQ(18293, trunc(18297, "month")); + EXPECT_EQ(18262, trunc(18297, "quarter")); + EXPECT_EQ(18262, trunc(18297, "year")); +} } // namespace } // namespace facebook::velox::functions::sparksql::test diff --git a/velox/functions/sparksql/tests/StringTest.cpp b/velox/functions/sparksql/tests/StringTest.cpp index 419488eb5e5e..7f8e0707fd41 100644 --- a/velox/functions/sparksql/tests/StringTest.cpp +++ b/velox/functions/sparksql/tests/StringTest.cpp @@ -1055,5 +1055,6 @@ TEST_F(StringTest, empty2Null) { EXPECT_EQ(empty2Null(""), std::nullopt); EXPECT_EQ(empty2Null("abc"), "abc"); } + } // namespace } // namespace facebook::velox::functions::sparksql::test diff --git a/velox/functions/sparksql/tests/UpperLowerTest.cpp b/velox/functions/sparksql/tests/UpperLowerTest.cpp new file mode 100644 index 000000000000..739bff087fdb --- /dev/null +++ b/velox/functions/sparksql/tests/UpperLowerTest.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" + +namespace facebook::velox::functions::sparksql::test { +namespace { + +class UpperLowerTest : public SparkFunctionBaseTest { + protected: + std::optional<std::string> lower(std::optional<std::string> str) { + return evaluateOnce<std::string>("lower(c0)", str); + } + + std::optional<std::string> upper(std::optional<std::string> str) { + return evaluateOnce<std::string>("upper(c0)", str); + } +}; + +TEST_F(UpperLowerTest, lowerAscii) { + EXPECT_EQ("abcdefg", lower("ABCDEFG")); + EXPECT_EQ("abcdefg", lower("abcdefg")); + EXPECT_EQ("a b c d e f g", lower("a B c D e F g")); +} + +TEST_F(UpperLowerTest, lowerUnicode) { + EXPECT_EQ( + "àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ", + lower("ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ")); + EXPECT_EQ("αβγδεζηθικλμνξοπρσστυφχψ", lower("ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨ")); + EXPECT_EQ( + "абвгдежзийклмнопрстуфхцчшщъыьэюя", + lower("АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ")); + EXPECT_EQ("i\xCC\x87", lower("\u0130")); + EXPECT_EQ("i\xCC\x87", lower("I\xCC\x87")); + EXPECT_EQ("\u010B", lower("\u010A")); + EXPECT_EQ("\u0117", lower("\u0116")); + EXPECT_EQ("\u0121", lower("\u0120")); + EXPECT_EQ("\u017C", lower("\u017B")); + EXPECT_EQ("\u0227", lower("\u0226")); + EXPECT_EQ("\u022F", lower("\u022E")); + EXPECT_EQ("\u1E03", lower("\u1E02")); + EXPECT_EQ("\u1E0B", lower("\u1E0A")); + EXPECT_EQ("\u1E1F", lower("\u1E1E")); + EXPECT_EQ("\u1E23", lower("\u1E22")); + EXPECT_EQ("\u1E41", lower("\u1E40")); + EXPECT_EQ("\u1E45", lower("\u1E44")); + EXPECT_EQ("\u1E57", lower("\u1E56")); + EXPECT_EQ("\u1E59", lower("\u1E58")); + EXPECT_EQ("\u1E61", lower("\u1E60")); + EXPECT_EQ("\u1E65", lower("\u1E64")); + EXPECT_EQ("\u1E67", lower("\u1E66")); + EXPECT_EQ("\u1E69", lower("\u1E68")); + EXPECT_EQ("\u1E6B", lower("\u1E6A")); + EXPECT_EQ("\u1E87", lower("\u1E86")); + EXPECT_EQ("\u1E8B", lower("\u1E8A")); + EXPECT_EQ("\u1E8F", lower("\u1E8E")); +} + +TEST_F(UpperLowerTest, upperAscii) { + EXPECT_EQ("ABCDEFG", upper("abcdefg")); + EXPECT_EQ("ABCDEFG", upper("ABCDEFG")); + EXPECT_EQ("A B C D E F G", upper("a B c D e F g")); +} + +TEST_F(UpperLowerTest, upperUnicode) { + EXPECT_EQ( + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ", + upper("àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ")); + EXPECT_EQ("ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨ", upper("αβγδεζηθικλμνξοπρςστυφχψ")); + EXPECT_EQ( + "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", + upper("абвгдежзийклмнопрстуфхцчшщъыьэюя")); + EXPECT_EQ("\u0049", upper("\u0069")); + EXPECT_EQ("I\xCC\x87", upper("i\xCC\x87")); + EXPECT_EQ("\u010A", upper("\u010B")); + EXPECT_EQ("\u0116", upper("\u0117")); + EXPECT_EQ("\u0120", upper("\u0121")); + EXPECT_EQ("\u017B", upper("\u017C")); + EXPECT_EQ("\u0226", upper("\u0227")); + EXPECT_EQ("\u022E", upper("\u022F")); + EXPECT_EQ("\u1E02", upper("\u1E03")); + EXPECT_EQ("\u1E0A", upper("\u1E0B")); + EXPECT_EQ("\u1E1E", upper("\u1E1F")); + EXPECT_EQ("\u1E22", upper("\u1E23")); + EXPECT_EQ("\u1E40", upper("\u1E41")); + EXPECT_EQ("\u1E44", upper("\u1E45")); + EXPECT_EQ("\u1E56", upper("\u1E57")); + EXPECT_EQ("\u1E58", upper("\u1E59")); + EXPECT_EQ("\u1E60", upper("\u1E61")); + EXPECT_EQ("\u1E64", upper("\u1E65")); + EXPECT_EQ("\u1E66", upper("\u1E67")); + EXPECT_EQ("\u1E68", upper("\u1E69")); + EXPECT_EQ("\u1E6A", upper("\u1E6B")); + EXPECT_EQ("\u1E86", upper("\u1E87")); + EXPECT_EQ("\u1E8A", upper("\u1E8B")); + EXPECT_EQ("\u1E8E", upper("\u1E8F")); +} + +} // namespace +} // namespace facebook::velox::functions::sparksql::test diff --git a/velox/functions/sparksql/tests/VarcharTypeWriteSideCheckTest.cpp b/velox/functions/sparksql/tests/VarcharTypeWriteSideCheckTest.cpp new file mode
100644 index 000000000000..4d38cd1a374c --- /dev/null +++ b/velox/functions/sparksql/tests/VarcharTypeWriteSideCheckTest.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" + +namespace facebook::velox::functions::sparksql::test { +namespace { + +class VarcharTypeWriteSideCheckTest : public SparkFunctionBaseTest {}; + +TEST_F(VarcharTypeWriteSideCheckTest, varcharTypeWriteSideCheck) { + const auto varcharTypeWriteSideCheck = + [&](const std::optional<std::string>& input, + const std::optional<int32_t>& limit) { + return evaluateOnce<std::string>( + "varchar_type_write_side_check(c0, c1)", input, limit); + }; + + // Basic cases - string length <= limit. + EXPECT_EQ(varcharTypeWriteSideCheck("abc", 3), "abc"); + EXPECT_EQ(varcharTypeWriteSideCheck("ab", 3), "ab"); + EXPECT_EQ(varcharTypeWriteSideCheck("", 5), ""); + + // Cases with trailing spaces. + // Edge cases - input string is longer than limit but trims to exactly limit. + EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 3), "abc"); + EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 4), "abc "); + EXPECT_EQ(varcharTypeWriteSideCheck("abc ", 5), "abc "); + + // Unicode string cases with trailing spaces. + EXPECT_EQ(varcharTypeWriteSideCheck("世界 ", 2), "世界"); + EXPECT_EQ(varcharTypeWriteSideCheck("世界", 2), "世界"); + + // Error cases - string length > limit even after trimming trailing spaces. + VELOX_ASSERT_USER_THROW( + varcharTypeWriteSideCheck("abcd", 3), + "Exceeds allowed length limitation: 3"); + VELOX_ASSERT_USER_THROW( + varcharTypeWriteSideCheck("世界人", 2), + "Exceeds allowed length limitation: 2"); + VELOX_ASSERT_USER_THROW( + varcharTypeWriteSideCheck("abc def", 5), + "Exceeds allowed length limitation: 5"); + + // Null input cases. + EXPECT_EQ(varcharTypeWriteSideCheck(std::nullopt, 5), std::nullopt); + + // Edge cases - length limit must be positive. + VELOX_ASSERT_USER_THROW( + varcharTypeWriteSideCheck("abc", 0), + "The length limit must be greater than 0."); + VELOX_ASSERT_USER_THROW( + varcharTypeWriteSideCheck("abc", -1), + "The length limit must be greater than 0."); + + // Edge cases - input string is all spaces. + EXPECT_EQ(varcharTypeWriteSideCheck(" ", 2), " "); + EXPECT_EQ(varcharTypeWriteSideCheck(" ", 3), " "); + EXPECT_EQ(varcharTypeWriteSideCheck(" ", 1), " "); +} + +} // namespace +} // namespace facebook::velox::functions::sparksql::test diff --git a/velox/python/arrow/arrow.pyi b/velox/python/arrow/arrow.pyi index 4302d56cbeaf..aec7a8b1c5f6 100644 --- a/velox/python/arrow/arrow.pyi +++ b/velox/python/arrow/arrow.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates.
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,8 +14,6 @@ # pyre-unsafe -from typing import List - from pyvelox.vector import Vector from pyarrow import Array diff --git a/velox/python/file/file.pyi b/velox/python/file/file.pyi index 0ecf36069a43..33b54c8a3275 100644 --- a/velox/python/file/file.pyi +++ b/velox/python/file/file.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,7 +16,6 @@ from pyvelox.type import Type - class File: def __init__(self, path: str, format_str: str) -> None: ... def get_schema(self) -> Type: ... diff --git a/velox/python/plan_builder/plan_builder.pyi b/velox/python/plan_builder/plan_builder.pyi index 2461ec9ece4d..83cd5d1d523e 100644 --- a/velox/python/plan_builder/plan_builder.pyi +++ b/velox/python/plan_builder/plan_builder.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,12 +15,11 @@ # pyre-unsafe from enum import Enum -from typing import List, Dict, Type, Optional +from typing import Optional from pyvelox.file import File from pyvelox.type import Type - class JoinType(Enum): INNER = 1 LEFT = 2 @@ -41,11 +38,11 @@ class PlanBuilder: def table_scan( self, output_schema: Type, - aliases: Dict[str, str] = {}, - subfields: Dict[str, List[int]] = {}, + aliases: dict[str, str] = {}, + subfields: dict[str, list[int]] = {}, row_index: str = "", connector_id: str = "prism", - input_files: List[File] = [], + input_files: list[File] = [], ) -> PlanBuilder: ... def tpch_gen( self, @@ -53,7 +50,7 @@ class PlanBuilder: columns: list[str] = [], scale_factor: int = 1, num_parts: int = 1, - connector_id: str = "tpch" + connector_id: str = "tpch", ) -> PlanBuilder: ... def table_write( self, diff --git a/velox/python/runner/runner.pyi b/velox/python/runner/runner.pyi index b620d6e5ecbe..d511692552bd 100644 --- a/velox/python/runner/runner.pyi +++ b/velox/python/runner/runner.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +18,6 @@ from typing import Iterator, Optional from pyvelox.vector import Vector - class LocalRunner: def __init__(self, PlanNode) -> None: ... def execute(self, max_drivers: Optional[int] = None) -> Iterator[Vector]: ... diff --git a/velox/python/type/type.pyi b/velox/python/type/type.pyi index a807c3a391e4..6d7f95fccd38 100644 --- a/velox/python/type/type.pyi +++ b/velox/python/type/type.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,7 +16,6 @@ from typing import List - class Type: ... def BIGINT() -> Type: ... diff --git a/velox/python/vector/vector.pyi b/velox/python/vector/vector.pyi index 482837adf49e..3504a1dae4a2 100644 --- a/velox/python/vector/vector.pyi +++ b/velox/python/vector/vector.pyi @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Copyright (c) Facebook, Inc. and its affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,9 +14,6 @@ # pyre-unsafe -from typing import List - - class Vector: def size(self) -> int: ... def print_all(self) -> str: ... 
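As a quick cross-check of the epoch-day constants used in the Spark `trunc` test above (19576 = 2023-08-07, a Monday; 18297 = 2020-02-05; 18293 = 2020-02-01): Velox DATE values count days since the Unix epoch, so the expectations can be verified with standard C++20 `<chrono>`. A minimal standalone sketch, illustrative only and not part of the patch:

```cpp
#include <chrono>

int main() {
  using namespace std::chrono;
  // DATE is days since 1970-01-01 (the Unix epoch).
  constexpr sys_days d19576 = sys_days{} + days{19576};
  static_assert(year_month_day{d19576} == 2023y / 8 / 7);
  static_assert(weekday{d19576} == Monday); // trunc(date, 'week') keeps it.

  constexpr sys_days d18297 = sys_days{} + days{18297};
  static_assert(year_month_day{d18297} == 2020y / 2 / 5);
  // trunc(date, 'month') maps 2020-02-05 to 2020-02-01, i.e. day 18293.
  static_assert(sys_days{2020y / 2 / 1} - sys_days{} == days{18293});
  return 0;
}
```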
diff --git a/velox/substrait/tests/data/q1_first_stage.json b/velox/substrait/tests/data/q1_first_stage.json index 1b9ba06231d6..6b4c1cf7a499 100644 --- a/velox/substrait/tests/data/q1_first_stage.json +++ b/velox/substrait/tests/data/q1_first_stage.json @@ -874,4 +874,4 @@ } ], "expected_type_urls": [] -} \ No newline at end of file +} diff --git a/velox/substrait/tests/data/q6_first_stage.json b/velox/substrait/tests/data/q6_first_stage.json index b6c2f535df84..36597a362516 100644 --- a/velox/substrait/tests/data/q6_first_stage.json +++ b/velox/substrait/tests/data/q6_first_stage.json @@ -600,4 +600,4 @@ } } ] -} \ No newline at end of file +} diff --git a/velox/tool/trace/CMakeLists.txt b/velox/tool/trace/CMakeLists.txt index ba147387ff86..1907e96cbc67 100644 --- a/velox/tool/trace/CMakeLists.txt +++ b/velox/tool/trace/CMakeLists.txt @@ -23,7 +23,8 @@ add_library( TableScanReplayer.cpp TableWriterReplayer.cpp TraceReplayRunner.cpp - TraceReplayTaskRunner.cpp) + TraceReplayTaskRunner.cpp + UnnestReplayer.cpp) target_link_libraries( velox_query_trace_replayer_base diff --git a/velox/tool/trace/TraceReplayRunner.cpp b/velox/tool/trace/TraceReplayRunner.cpp index 383b803b4fbd..ddeb54a52498 100644 --- a/velox/tool/trace/TraceReplayRunner.cpp +++ b/velox/tool/trace/TraceReplayRunner.cpp @@ -49,6 +49,7 @@ #include "velox/tool/trace/PartitionedOutputReplayer.h" #include "velox/tool/trace/TableScanReplayer.h" #include "velox/tool/trace/TableWriterReplayer.h" +#include "velox/tool/trace/UnnestReplayer.h" #include "velox/type/Type.h" #ifdef VELOX_ENABLE_PARQUET @@ -410,6 +411,16 @@ TraceReplayRunner::createReplayer() const { FLAGS_driver_ids, queryCapacityBytes, cpuExecutor_.get()); + } else if (traceNodeName == "Unnest") { + replayer = std::make_unique<UnnestReplayer>( + FLAGS_root_dir, + FLAGS_query_id, + FLAGS_task_id, + FLAGS_node_id, + traceNodeName, + FLAGS_driver_ids, + queryCapacityBytes, + cpuExecutor_.get()); } else { VELOX_UNSUPPORTED("Unsupported operator type: {}", traceNodeName); } diff --git a/velox/tool/trace/UnnestReplayer.cpp b/velox/tool/trace/UnnestReplayer.cpp new file mode 100644 index 000000000000..c072181eb302 --- /dev/null +++ b/velox/tool/trace/UnnestReplayer.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "velox/tool/trace/UnnestReplayer.h" + +#include "velox/exec/TraceUtil.h" +#include "velox/exec/tests/utils/PlanBuilder.h" + +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::exec::test; + +namespace facebook::velox::tool::trace { +core::PlanNodePtr UnnestReplayer::createPlanNode( + const core::PlanNode* node, + const core::PlanNodeId& nodeId, + const core::PlanNodePtr& source) const { + const auto* unnestNode = dynamic_cast<const core::UnnestNode*>(node); + VELOX_CHECK_NOT_NULL(unnestNode); + return std::make_shared<core::UnnestNode>( + nodeId, + unnestNode->replicateVariables(), + unnestNode->unnestVariables(), + unnestNode->unnestNames(), + unnestNode->ordinalityName(), + source); +} +} // namespace facebook::velox::tool::trace diff --git a/velox/tool/trace/UnnestReplayer.h b/velox/tool/trace/UnnestReplayer.h new file mode 100644 index 000000000000..87ab6c69ea3b --- /dev/null +++ b/velox/tool/trace/UnnestReplayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/core/PlanNode.h" +#include "velox/tool/trace/OperatorReplayerBase.h" + +namespace facebook::velox::tool::trace { +/// The replayer to replay the traced 'Unnest' operator. +class UnnestReplayer : public OperatorReplayerBase { + public: + UnnestReplayer( + const std::string& traceDir, + const std::string& queryId, + const std::string& taskId, + const std::string& nodeId, + const std::string& operatorType, + const std::string& driverIds, + uint64_t queryCapacity, + folly::Executor* executor) + : OperatorReplayerBase( + traceDir, + queryId, + taskId, + nodeId, + operatorType, + driverIds, + queryCapacity, + executor) {} + + private: + core::PlanNodePtr createPlanNode( + const core::PlanNode* node, + const core::PlanNodeId& nodeId, + const core::PlanNodePtr& source) const override; +}; +} // namespace facebook::velox::tool::trace diff --git a/velox/tool/trace/tests/CMakeLists.txt b/velox/tool/trace/tests/CMakeLists.txt index c7d6716613e9..332f5bbb8cdb 100644 --- a/velox/tool/trace/tests/CMakeLists.txt +++ b/velox/tool/trace/tests/CMakeLists.txt @@ -21,7 +21,8 @@ add_executable( PartitionedOutputReplayerTest.cpp TraceFileToolTest.cpp TableScanReplayerTest.cpp - TableWriterReplayerTest.cpp) + TableWriterReplayerTest.cpp + UnnestReplayerTest.cpp) add_test( NAME velox_tool_trace_test diff --git a/velox/tool/trace/tests/UnnestReplayerTest.cpp b/velox/tool/trace/tests/UnnestReplayerTest.cpp new file mode 100644 index 000000000000..6e1e4b476459 --- /dev/null +++ b/velox/tool/trace/tests/UnnestReplayerTest.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + *     http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <gtest/gtest.h> + +#include <memory> + +#include "velox/common/file/FileSystems.h" +#include "velox/exec/PartitionFunction.h" +#include "velox/exec/TraceUtil.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/HiveConnectorTestBase.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/serializers/PrestoSerializer.h" +#include "velox/tool/trace/TraceReplayRunner.h" +#include "velox/tool/trace/UnnestReplayer.h" + +using namespace facebook::velox; +using namespace facebook::velox::core; +using namespace facebook::velox::exec; +using namespace facebook::velox::exec::test; +using namespace facebook::velox::connector; +using namespace facebook::velox::connector::hive; + +namespace facebook::velox::tool::trace::test { +class UnnestReplayerTest : public HiveConnectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); + HiveConnectorTestBase::SetUpTestCase(); + filesystems::registerLocalFileSystem(); + if (!isRegisteredVectorSerde()) { + serializer::presto::PrestoVectorSerde::registerVectorSerde(); + } + Type::registerSerDe(); + common::Filter::registerSerDe(); + connector::hive::HiveTableHandle::registerSerDe(); + connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveColumnHandle::registerSerDe(); + connector::hive::HiveInsertTableHandle::registerSerDe(); + connector::hive::HiveInsertFileNameGenerator::registerSerDe(); + connector::hive::HiveConnectorSplit::registerSerDe(); + core::PlanNode::registerSerDe(); + velox::exec::trace::registerDummySourceSerDe(); + core::ITypedExpr::registerSerDe(); + registerPartitionFunctionSerDe(); + } + + void TearDown() override { + input_.clear(); + HiveConnectorTestBase::TearDown(); + } + + std::vector<RowVectorPtr> + makeVectors(int32_t count, int32_t rowsPerVector, const RowTypePtr& rowType) { + return HiveConnectorTestBase::makeVectors(rowType, count, rowsPerVector); + } + + std::vector<exec::Split> makeSplits( + const std::vector<RowVectorPtr>& inputs, + const std::string& path) { + std::vector<exec::Split> splits; + for (auto i = 0; i < 4; ++i) { + const std::string filePath = fmt::format("{}/{}", path, i); + writeToFile(filePath, inputs); + splits.emplace_back(makeHiveConnectorSplit(filePath)); + } + + return splits; + } + + struct PlanWithSplits { + core::PlanNodePtr plan; + core::PlanNodeId scanId; + std::vector<exec::Split> splits; + + explicit PlanWithSplits( + const core::PlanNodePtr& _plan, + const core::PlanNodeId& _scanId = "", + const std::vector<exec::Split>& _splits = {}) + : plan(_plan), scanId(_scanId), splits(_splits) {} + }; + + PlanWithSplits createPlan() { + auto planNodeIdGenerator = std::make_shared<PlanNodeIdGenerator>(); + core::PlanNodeId scanId; + + auto plan = PlanBuilder(planNodeIdGenerator) + .tableScan(inputType_) + .capturePlanNodeId(scanId) + .unnest({"c0"}, {"c1"}) + .capturePlanNodeId(unnestId_) + .planNode(); + + const std::vector<exec::Split> splits = + makeSplits(input_, fmt::format("{}/splits", testDir_->getPath())); + return PlanWithSplits{plan, scanId, splits}; + } + + core::PlanNodeId
unnestId_; + RowTypePtr inputType_{ROW({"c0", "c1"}, {BIGINT(), ARRAY(INTEGER())})}; + + std::vector<RowVectorPtr> input_ = makeVectors(5, 100, inputType_); + + const std::shared_ptr<TempDirectoryPath> testDir_ = + TempDirectoryPath::create(); +}; + +TEST_F(UnnestReplayerTest, test) { + // Create input data with arrays. + auto arrayVector = makeArrayVector<int32_t>( + 100, + [](auto row) { return row % 5 + 1; }, + [](auto row, auto index) { return index * (row % 3); }); + + input_ = {makeRowVector( + {makeFlatVector<int64_t>(100, [](auto row) { return row; }), + arrayVector})}; + + // Run the original query and get results. + const auto planWithSplits = createPlan(); + AssertQueryBuilder builder(planWithSplits.plan); + const auto result = builder.splits(planWithSplits.splits).copyResults(pool()); + + // Run the query with tracing enabled. + const auto traceRoot = + fmt::format("{}/{}/traceRoot/", testDir_->getPath(), "basic"); + std::shared_ptr<Task> task; + auto tracePlanWithSplits = createPlan(); + AssertQueryBuilder traceBuilder(tracePlanWithSplits.plan); + traceBuilder.maxDrivers(4) + .config(core::QueryConfig::kQueryTraceEnabled, true) + .config(core::QueryConfig::kQueryTraceDir, traceRoot) + .config(core::QueryConfig::kQueryTraceMaxBytes, 100UL << 30) + .config(core::QueryConfig::kQueryTraceTaskRegExp, ".*") + .config(core::QueryConfig::kQueryTraceNodeId, unnestId_); + auto traceResult = + traceBuilder.splits(tracePlanWithSplits.splits).copyResults(pool(), task); + + // Verify that the traced results match the original results. + assertEqualResults({result}, {traceResult}); + + // Replay the traced execution and verify results. + const auto replayingResult = UnnestReplayer( + traceRoot, + task->queryCtx()->queryId(), + task->taskId(), + unnestId_, + "Unnest", + "", + 0, + executor_.get()) + .run(); + assertEqualResults({result}, {replayingResult}); +} + +} // namespace facebook::velox::tool::trace::test diff --git a/website/README.md b/website/README.md index 9ebdfccf492c..09465bc0eb9e 100644 --- a/website/README.md +++ b/website/README.md @@ -31,4 +31,4 @@ Velox's website is automatically deployed using the files under *velox/website* is submitted, a live preview link is generated by Netlify. The link is posted in the pull request as a comment by the Netlify bot. When the pull request is merged, the changes are automatically deployed to -the website by Netlify. +the website by Netlify. diff --git a/website/blog/2023-03-15-build-experience.mdx b/website/blog/2023-03-15-build-experience.mdx index 164c893265e9..d5ab05290cdd 100644 --- a/website/blog/2023-03-15-build-experience.mdx +++ b/website/blog/2023-03-15-build-experience.mdx @@ -6,13 +6,13 @@ tags: [tech-blog, packaging] --- -When Velox was open sourced in August 2021, it was not nearly as easily usable and portable as it is today. In order for Velox to become the unified execution engine blurring the boundaries for data analytics and ML, we needed Velox to be easy to build and package on multiple platforms, and support a wide range of hardware architectures.
If we are supporting all these platforms, we also need to ensure that Velox remains fast and regressions are caught early. To improve the Velox experience for users and community developers, Velox has partnered with Voltron Data to help make Velox more accessible and user-friendly. In this blog post, we will examine the challenges we faced, the improvements that have already been made, and the ones yet to come. ## Enhancements & Improvements -Velox was a product of the mono repo and required installation of dependencies on the system via a script. Any change in the state of the host system could cause a build failure and introduce version conflicts of dependencies. Fixing these challenges was a big focus to help the Velox Community and we worked in collaboration with the Voltron Data Team. We wanted to improve the overall Velox user experience by making Velox easy to consume across a wide range of platforms to accelerate its adoption. +Velox was a product of the mono repo and required installation of dependencies on the system via a script. Any change in the state of the host system could cause a build failure and introduce version conflicts of dependencies. Fixing these challenges was a big focus to help the Velox Community and we worked in collaboration with the Voltron Data Team. We wanted to improve the overall Velox user experience by making Velox easy to consume across a wide range of platforms to accelerate its adoption. We chose hermetic builds as a solution to the aforementioned problems, as they provide a number of benefits. Hermetic builds[^1] improve reproducibility by providing isolation from the state of the host machine and produce the same result for any given commit in the Velox repository. This requires precise dependency management. diff --git a/website/blog/2024-05-31-optimize-try-more.mdx b/website/blog/2024-05-31-optimize-try-more.mdx index dc4b6edc5ce3..d72f6aadb295 100644 --- a/website/blog/2024-05-31-optimize-try-more.mdx +++ b/website/blog/2024-05-31-optimize-try-more.mdx @@ -352,4 +352,4 @@ Thank you Bikramjeet Vig, Orri Erling, Pedro Eugenio Rocha Pedreira and Xiaoxuan Meng for brainstorming and -helping with code reviews. \ No newline at end of file +helping with code reviews. diff --git a/website/blog/2024-08-23-ci-migration.mdx b/website/blog/2024-08-23-ci-migration.mdx index 398c17279b60..f93a0d335da1 100644 --- a/website/blog/2024-08-23-ci-migration.mdx +++ b/website/blog/2024-08-23-ci-migration.mdx @@ -22,7 +22,7 @@ When a pull request is submitted to Velox, the following jobs are executed: 1. Linting and Formatting workflows: 1. Header checks 2. License checks - 3. Basic Linters + 3. Basic Linters 2. Ensure Velox builds on various platforms 1. MacOS (Intel, M1) 2. Linux (Ubuntu/Centos) @@ -31,11 +31,11 @@ When a pull request is submitted to Velox, the following jobs are executed: 2. Build default Velox build 3. Build Velox with support for Parquet, Arrow and External Adapters (S3/HDFS/GCS etc.) 4. PyVelox builds -4. Run prerequisite tests +4. Run prerequisite tests 1. Unit Tests 2. Benchmarking Tests 1. [Conbench](https://velox-conbench.voltrondata.run/runs/5bd139fffa9b4e0eb020da4d63211121/) is used to store and compare results, and also alert users on regressions - 3. Various Fuzzer Tests (Expression / Aggregation/ Exchange / Join etc) + 3. Various Fuzzer Tests (Expression / Aggregation/ Exchange / Join etc) 4. Signature Check and Biased Fuzzer Tests ( Expression / Aggregation) 5. Fuzzer Tests using Presto as source of truth 5.
Docker Image build jobs @@ -49,7 +49,7 @@ When a pull request is submitted to Velox, the following jobs are executed: ## Velox CI Optimization -Previous implementation of CI in CircleCI grew organically and was unoptimized, resulting in long build times, and also significantly costlier. This opportunity to migrate to Github Actions helped to take a holistic view of CI deployments and actively optimized to reduce build times and CI spend. Note however, that there has been continued investment in reducing test times to further improve Velox reliability, stability and developer experience. Some of the optimizations completed are: +Previous implementation of CI in CircleCI grew organically and was unoptimized, resulting in long build times, and also significantly costlier. This opportunity to migrate to Github Actions helped to take a holistic view of CI deployments and actively optimized to reduce build times and CI spend. Note however, that there has been continued investment in reducing test times to further improve Velox reliability, stability and developer experience. Some of the optimizations completed are: 1. **Persisting build artifacts across builds**: During every build, the object files and binaries produced are cached. In addition to this, artifacts such as scalar function signatures and aggregate function signatures are produced. These signatures are used to compare with the baseline version, by comparing against the changes in the current PR to determine if the current changes are backwards incompatible or bias the newly added changes. Using a stash to persist these artifacts helps save one build cycle. @@ -57,11 +57,11 @@ Previous implementation of CI in CircleCI grew organically and was unoptimized, ## Instrumenting Velox CI Builds -Velox CI builds were instrumented in Conbench so that it can capture various metrics about the builds: +Velox CI builds were instrumented in Conbench so that it can capture various metrics about the builds: 1. Build times at translation unit / library/ project level. 2. Binary sizes produced at TLU/ .a,.so / executable level. -3. Memory pressure -4. Measure across time how our changes affect binary sizes +3. Memory pressure +4. Measure across time how our changes affect binary sizes A nightly job is run to capture these build metrics and it is uploaded to Conbench. Velox build metrics report is available here: [Velox Build Metrics Report](https://facebookincubator.github.io/velox/bm-report/) @@ -69,10 +69,8 @@ A nightly job is run to capture these build metrics and it is uploaded to Conben ## Acknowledgements -A large part of the credit goes to Jacob Wujciak and the team at Voltron Data. We would also like to thank other collaborators in the Open Source Community and at Meta, including but not limited to: +A large part of the credit goes to Jacob Wujciak and the team at Voltron Data. 
We would also like to thank other collaborators in the Open Source Community and at Meta, including but not limited to: **Meta**: Sridhar Anumandla, Pedro Eugenio Rocha Pedreira, Deepak Majeti, Meta OSS Team, and others **Voltron Data**: Jacob Wujciak, Austin Dickey, Marcus Hanwell, Sri Nadukudy, and others - - diff --git a/website/blog/2025-03-25-velox-primer-part-2.mdx b/website/blog/2025-03-25-velox-primer-part-2.mdx index 48c793509cf2..27407953695e 100644 --- a/website/blog/2025-03-25-velox-primer-part-2.mdx +++ b/website/blog/2025-03-25-velox-primer-part-2.mdx @@ -33,9 +33,9 @@ distributed query plan with three query fragments will be created: divides its output according to a hash of *l_partkey*. 2. The second fragment reads the output from the first fragment and updates a hash table from *l_partkey* containing the number of times the particular value -of *l_partkey* has been seen (the count(*) aggregate function implementation). +of *l_partkey* has been seen (the count(*) aggregate function implementation). 3. The final fragment then reads the content of the hash tables, once the -second fragment has received all the rows from the first fragment. +second fragment has received all the rows from the first fragment.
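To make the fragment structure described in the primer passage above concrete, here is a hedged sketch of the first fragment (scan, partial count(*) grouped by l_partkey, repartition by a hash of l_partkey) using Velox's test-utility PlanBuilder. The row type and partition count are illustrative assumptions, not code from the post:

```cpp
#include "velox/exec/tests/utils/PlanBuilder.h"

using namespace facebook::velox;
using namespace facebook::velox::exec::test;

// First query fragment: scan lineitem's l_partkey, partially aggregate
// count(*) per key, then hash-partition the partial results by l_partkey
// so the second fragment can finish the aggregation.
core::PlanNodePtr makeFirstFragment() {
  auto rowType = ROW({"l_partkey"}, {BIGINT()});
  return PlanBuilder()
      .tableScan(rowType)
      .partialAggregation({"l_partkey"}, {"count(1)"})
      .partitionedOutput({"l_partkey"}, /*numPartitions=*/3)
      .planNode();
}
```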
@@ -71,7 +71,7 @@ In Prestissimo, the message that sets up a Task in a worker is called *Task Update*. A Task Update has the following information: the plan, configuration settings, and an optional list of splits. Splits are further qualified by what plan node they are intended for, and whether more splits for the recipient plan -node and split group will be coming. +node and split group will be coming. Since split generation involves enumerating files from storage (so they may take a while), Presto allows splits to be sent to workers asynchronously, such @@ -109,13 +109,13 @@ operator tree where each node consumes the output of its child operators, and returns output to the parent operator. The root node is typically a PartitionedOutputNode or a TableWriteNode. The leaf nodes are either TableScanNode, ExchangeNode or ValuesNode (used for query literals). The full -set of Velox PlanNode can be found at velox/core/PlanNode.h. +set of Velox PlanNode can be found at velox/core/PlanNode.h. The PlanNodes mostly correspond to Operators. PlanNodes are not executable as such; they are only a structure describing how to make Drivers and Operators, which do the actual execution. If the tree of nodes has a single branch, then the plan is a single pipeline. If it has nodes with more than one child -(input), then the second input of the node becomes a separate pipeline. +(input), then the second input of the node becomes a separate pipeline. `Task::start()` creates the DriverFactories, which then create the Drivers. To start execution, the Drivers are queued on a thread pool executor. The main @@ -151,7 +151,7 @@ that no more splits will be coming. In this case TableScan would be at the end. Finally, if a Split is available, TableScan interprets it. Given a TableHandle specification provided as part of the plan (list of columns and filters), the Connector (as specified in the Split) makes a DataSource. The DataSource -handles the details of IO and file and table formats. +handles the details of IO and file and table formats. The DataSource is then given the split. After this, DataSource::next() can be called repeatedly to get vectors (batches) of output from the file/section of diff --git a/website/blog/2025-05-12-velox-primer-part-3.mdx b/website/blog/2025-05-12-velox-primer-part-3.mdx index b09ea8beb342..424c6736cc94 100644 --- a/website/blog/2025-05-12-velox-primer-part-3.mdx +++ b/website/blog/2025-05-12-velox-primer-part-3.mdx @@ -19,7 +19,7 @@ stage of the query is executed, from table scan to partitioned output - or the producer side of the shuffle.   In this article, we will discuss the second query stage, or the consumer side -of the shuffle. +of the shuffle. ## Shuffle Consumer diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 016a74747ee1..cffe9df50581 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -40,7 +40,7 @@ kgpai: name: Krishna Pai title: Software Engineer @ Meta url: https://github.com/kgpai - image_url: https://github.com/kgpai.png + image_url: https://github.com/kgpai.png jwujciak: name: Jacob Wujciak-Jens diff --git a/website/docs/community/01-design-philosophy.md b/website/docs/community/01-design-philosophy.md index 83902c8606b1..8220f890ccc5 100644 --- a/website/docs/community/01-design-philosophy.md +++ b/website/docs/community/01-design-philosophy.md @@ -8,7 +8,7 @@ title: Design Philosophy This page lists a set of directional principles and values meant to guide contributors and maintainers as they develop the Velox project. 
These are not meant to be hard-and-fast rules, but to inform decision making and help guide -discussions that may come up during the development of Velox. +discussions that may come up during the development of Velox. ## Velox Mission @@ -87,4 +87,3 @@ mission, a few overarching principles and values are highlighted below. * **Adaptivity.** Exposing too many configuration knobs to users increases the API complexity, and makes it more error-prone. As much as possible, we try to make the library self-adapt to find optimal execution configurations. - diff --git a/website/docs/community/02-technical-governance.md b/website/docs/community/02-technical-governance.md index b591ba723ef9..a959b03c4e05 100644 --- a/website/docs/community/02-technical-governance.md +++ b/website/docs/community/02-technical-governance.md @@ -213,7 +213,7 @@ opened for discussion. Components maintainers are responsible for participating in the discussion, reviewing, providing feedback, and eventually approving the changes. If disputes are made over the change, component maintainers are responsible for settling the issue, or ultimately escalating to the PLC in case -they cannot reach consensus. +they cannot reach consensus. ### Re-Scope Project Components @@ -236,7 +236,7 @@ project. Except as described below, all code contributions to the project must be made using the Apache 2.0 License available here: [https://www.apache.org/licenses/LICENSE-2.0 -](https://www.apache.org/licenses/LICENSE-2.0) (the "Project License"). +](https://www.apache.org/licenses/LICENSE-2.0) (the "Project License"). All outbound code will be made available under the Project License. The maintainers may approve the use of an alternative open license or licenses for @@ -249,7 +249,7 @@ start?** The Velox [contributing guide](https://github.com/facebookincubator/velox/blob/main/CONTRIBUTING.md) provides guidelines on how new community members can get involved in the -project. +project. **Q: Is it possible for an external contributor to become a maintainer and be granted responsibilities over parts of the codebase?** @@ -266,7 +266,7 @@ individuals, and membership in these groups is based on merit in the community. **Q: How do I contribute code to the project?** If the change is relatively minor, a pull request on GitHub can be opened up immediately for review by the project maintainers. For larger changes, please -open an Issue to make a proposal to discuss prior. Please also see the +open an Issue to make a proposal to discuss prior. Please also see the [Velox Contributor Guide](https://github.com/facebookincubator/velox/blob/main/CONTRIBUTING.md) for contribution guidelines. diff --git a/website/docs/community/03-components-and-maintainers.md b/website/docs/community/03-components-and-maintainers.md index a4eaf1a4f3c7..2983c33e8f0b 100644 --- a/website/docs/community/03-components-and-maintainers.md +++ b/website/docs/community/03-components-and-maintainers.md @@ -14,7 +14,7 @@ and codebase. Maintainership status is *lagging*, not *leading*, and it is only acquired through active participation and demonstration of skills and domain-specific knowledge. All individuals listed in this page are expected to uphold -[Velox’s mission, design philosophy, and principles](./design-philosophy). +[Velox’s mission, design philosophy, and principles](./design-philosophy). ## Project Leadership Council - PLC @@ -35,7 +35,7 @@ developer documentation. 
Before working on a new feature or optimization, please review our [CONTRIBUTING.md](https://github.com/facebookincubator/velox/blob/main/CONTRIBUTING.md) guide and initiate a discussion on Github with the people listed as -maintainers of that component. +maintainers of that component. ### Vectors, Types, Arrow Bindings: diff --git a/website/docs/community/index.md b/website/docs/community/index.md index 79ccf9525497..6d5c82ed6535 100644 --- a/website/docs/community/index.md +++ b/website/docs/community/index.md @@ -4,8 +4,8 @@ slug: /community/ # Community -Velox is a project created and -[open sourced by Meta in 2023](https://engineering.fb.com/2023/03/09/open-source/velox-open-source-execution-engine/). +Velox is a project created and +[open sourced by Meta in 2023](https://engineering.fb.com/2023/03/09/open-source/velox-open-source-execution-engine/). Today, Velox is developed and maintained by a community of 200+ individuals from 20+ different organizations. This page contains more information about Velox's open source community. diff --git a/website/src/components/HomepageFeatures/index.js b/website/src/components/HomepageFeatures/index.js index a3b6ea574b07..beb1752ef728 100644 --- a/website/src/components/HomepageFeatures/index.js +++ b/website/src/components/HomepageFeatures/index.js @@ -27,7 +27,7 @@ export default function HomepageFeatures() { ))}
- +
diff --git a/website/src/components/VeloxConBanner/index.js b/website/src/components/VeloxConBanner/index.js index 76a892a924ac..78dce089f33a 100644 --- a/website/src/components/VeloxConBanner/index.js +++ b/website/src/components/VeloxConBanner/index.js @@ -25,4 +25,4 @@ export default function VeloxConBanner() { ); -} \ No newline at end of file +} diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 448d42629738..c97d5d25c6bd 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -128,7 +128,7 @@ function KeyFeatures() { Reusability Icon

Reusability

- Features and runtime optimizations available in Velox are developed and maintained once, reducing engineering duplication and promoting reusability. + Features and runtime optimizations available in Velox are developed and maintained once, reducing engineering duplication and promoting reusability.

diff --git a/website/static/img/banner-pattern.svg b/website/static/img/banner-pattern.svg index 44c0b038939f..69a146553a25 100644 --- a/website/static/img/banner-pattern.svg +++ b/website/static/img/banner-pattern.svg @@ -100,4 +100,4 @@ - \ No newline at end of file + diff --git a/website/static/img/icon-commits.svg b/website/static/img/icon-commits.svg index f027389614f2..7ec9b630e449 100644 --- a/website/static/img/icon-commits.svg +++ b/website/static/img/icon-commits.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/website/static/img/icon-contributors.svg b/website/static/img/icon-contributors.svg index 1e08857ef96b..68eccc8d0b6e 100644 --- a/website/static/img/icon-contributors.svg +++ b/website/static/img/icon-contributors.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/website/static/img/icon-github-star.svg b/website/static/img/icon-github-star.svg index 841dd97b3176..28297614d124 100644 --- a/website/static/img/icon-github-star.svg +++ b/website/static/img/icon-github-star.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/website/static/img/logo.svg b/website/static/img/logo.svg index d3e7a794ddc1..8ceed2d92a5d 100644 --- a/website/static/img/logo.svg +++ b/website/static/img/logo.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/website/static/img/velox-logo.svg b/website/static/img/velox-logo.svg index 67627d4cfd2b..0db7eec5f1f5 100644 --- a/website/static/img/velox-logo.svg +++ b/website/static/img/velox-logo.svg @@ -1 +1 @@ - \ No newline at end of file +