Skip to content

Commit 7806294

Browse files
committed
Improve small sort performance
Use optimal in-place sorting for ranges of size 5 and smaller. This first minimizes the maximum and average number of comparisons (these two goals never conflict at this size), then minimizes comparisons for already-sorted inputs, then minimizes comparisons for reverse-sorted inputs. With this new code, there is no advantage to the hybrid approach I made for sorting 6 elements, so that is currently disabled. There is still work to be done here to optimize small ranges that are larger than the hand-coded algorithms. For ranges that are large, have non-trivial element types, or use a comparison function other than `std::less` or `std::greater`, assume comparisons are expensive and use an algorithm that minimizes comparisons; otherwise, use an algorithm that performs best for types like `int`. These small sorting algorithms perform much better than existing standard library implementations for all types tested.
1 parent 1487546 commit 7806294

17 files changed

+918
-330
lines changed

source/containers/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ target_sources(containers PUBLIC
1515
algorithms/sort/dereference_all.cpp
1616
algorithms/sort/double_buffered_ska_sort.cpp
1717
algorithms/sort/fixed_size_merge_sort.cpp
18+
algorithms/sort/low_high_ref.cpp
1819
algorithms/sort/inplace_radix_sort.cpp
1920
algorithms/sort/is_sorted.cpp
2021
algorithms/sort/merge_relocate_second_range.cpp
@@ -29,6 +30,7 @@ target_sources(containers PUBLIC
2930
algorithms/sort/sort_exactly_4.cpp
3031
algorithms/sort/sort_exactly_5.cpp
3132
algorithms/sort/sort_exactly_6.cpp
33+
algorithms/sort/sort_exactly_n.cpp
3234
algorithms/sort/to_radix_sort_key.cpp
3335
algorithms/accumulate.cpp
3436
algorithms/adjacent.cpp

source/containers/algorithms/sort/cheaply_sortable.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@ namespace containers {
1414

1515
// Size determined experimentally, but also the size of a cache line. Maybe it
1616
// should be `std::hardware_destructive_interference_size`?
17-
export template<typename T>
18-
concept cheaply_sortable = std::is_trivially_copyable_v<T> and sizeof(T) <= 64_bi;
17+
// This incorrectly considers `std::string_view` cheaply_sortable: it is small
18+
// and trivially copyable, but not cheap to compare. Ideally this would use something like `has_default_compare`.
19+
export template<typename T, typename Compare>
20+
concept cheaply_sortable =
21+
std::is_trivially_copyable_v<T> and
22+
sizeof(T) <= 64_bi and
23+
(std::same_as<Compare, std::less<>> or std::same_as<Compare, std::greater<>>);
1924

2025
} // namespace containers

source/containers/algorithms/sort/chunked_insertion_sort.cpp

Lines changed: 12 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,18 @@
33
// (See accompanying file LICENSE_1_0.txt or copy at
44
// http://www.boost.org/LICENSE_1_0.txt)
55

6+
module;
7+
8+
#include <bounded/assert.hpp>
9+
610
export module containers.algorithms.sort.chunked_insertion_sort;
711

812
import containers.algorithms.sort.merge_relocate_second_range;
913
import containers.algorithms.sort.small_size_optimized_sort;
10-
import containers.algorithms.sort.sort_exactly_1;
11-
import containers.algorithms.sort.sort_exactly_2;
12-
import containers.algorithms.sort.sort_exactly_3;
13-
import containers.algorithms.sort.sort_exactly_4;
14-
import containers.algorithms.sort.sort_exactly_5;
15-
import containers.algorithms.sort.sort_exactly_6;
14+
import containers.algorithms.sort.sort_exactly_n;
1615

1716
import containers.begin_end;
17+
import containers.integer_range;
1818
import containers.offset_type;
1919
import containers.range;
2020
import containers.size;
@@ -29,64 +29,31 @@ using namespace bounded::literal;
2929

3030
namespace containers {
3131

32-
template<typename Iterator>
33-
constexpr auto maybe_sort_exactly_n_relocate(Iterator it, auto const size, auto out, auto const compare) -> Iterator {
34-
constexpr auto max_size = numeric_traits::max_value<offset_type<Iterator>>;
35-
if constexpr (size <= max_size) {
36-
return ::containers::sort_exactly_n_relocate(it, size, out, compare);
37-
} else {
38-
std::unreachable();
39-
}
40-
}
41-
42-
template<typename Iterator>
43-
constexpr auto runtime_sort_exactly_n_relocate(Iterator it, auto const size, auto out, auto const compare) -> Iterator {
44-
auto do_sort = [&](auto const constant_size) {
45-
return ::containers::maybe_sort_exactly_n_relocate(it, constant_size, out, compare);
46-
};
47-
switch (static_cast<std::size_t>(size)) {
48-
case 0:
49-
return it;
50-
case 1:
51-
return do_sort(1_bi);
52-
case 2:
53-
return do_sort(2_bi);
54-
case 3:
55-
return do_sort(3_bi);
56-
case 4:
57-
return do_sort(4_bi);
58-
default:
59-
std::unreachable();
60-
}
61-
}
62-
6332
struct chunked_insertion_sort_t {
6433
template<typename Compare = std::less<>>
6534
static constexpr auto operator()(range auto && r, Compare const compare = Compare()) -> void {
6635
constexpr auto chunk_size = 4_bi;
36+
auto const size = ::containers::size(r);
6737
auto it = containers::begin(r);
68-
auto const last = containers::end(r);
69-
#if 1
70-
auto const initial_sort_size = bounded::min(chunk_size, ::containers::size(r));
38+
auto const initial_sort_size = size % chunk_size;
7139
::containers::small_size_optimized_sort(
7240
subrange(it, initial_sort_size),
7341
compare,
7442
[](auto &&, auto) { std::unreachable(); }
7543
);
7644
it += initial_sort_size;
77-
#endif
7845
auto buffer = containers::uninitialized_array<std::remove_reference_t<decltype(*it)>, chunk_size>();
79-
while (it != last) {
80-
auto const count = bounded::min(chunk_size, last - it);
81-
auto next_it = ::containers::runtime_sort_exactly_n_relocate(it, count, buffer.data(), compare);
46+
for (auto _ : containers::integer_range(size / chunk_size)) {
47+
auto const next_it = ::containers::sort_exactly_n_relocate(it, chunk_size, buffer.data(), compare);
8248
merge_relocate_second_range(
8349
subrange(containers::begin(r), it),
84-
subrange(buffer.data(), count),
50+
buffer,
8551
next_it,
8652
compare
8753
);
8854
it = next_it;
8955
}
56+
BOUNDED_ASSERT(it == containers::end(r));
9057
}
9158
};
9259
export constexpr auto chunked_insertion_sort = chunked_insertion_sort_t();

source/containers/algorithms/sort/dereference_all.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,35 @@
55

66
export module containers.algorithms.sort.dereference_all;
77

8+
import containers.array;
9+
import containers.iter_value_t;
10+
811
import bounded;
912
import std_module;
1013
export import tv;
1114

1215
namespace containers {
1316

14-
template<typename T, std::size_t>
15-
using type_ignore_size = T;
17+
export template<typename T, auto size>
18+
using element_refs = containers::array<std::reference_wrapper<T>, size>;
1619

1720
export template<typename Iterator>
1821
constexpr auto dereference_all(Iterator it, auto const size) {
19-
using T = decltype(*it);
20-
static_assert(std::is_lvalue_reference_v<T>);
21-
return [&]<std::size_t... indexes>(std::index_sequence<indexes...>) {
22-
// Must use brace initialization for guaranteed evaluation order. Uses a
23-
// tuple instead of an array (even though all types are the same)
24-
// because we cannot have arrays of references. Cannot use CTAD because
25-
// we want to deduce references for the values.
26-
return tv::tuple<type_ignore_size<T, indexes>..., Iterator>{(static_cast<void>(indexes), *it++)..., std::move(it)};
27-
}(bounded::make_index_sequence(size));
22+
auto const [...indexes] = bounded::index_sequence_struct(size);
23+
using value_type = iter_value_t<Iterator>;
24+
using elements_t = element_refs<value_type, size>;
25+
struct result_t {
26+
elements_t elements;
27+
Iterator last;
28+
};
29+
// Must use brace initialization for guaranteed evaluation order. Uses an
30+
// array of `std::reference_wrapper` instead of an array of references
31+
// because we cannot have arrays of references. Cannot use CTAD because
32+
// we want to store references to the values rather than copies.
33+
return result_t{
34+
elements_t{(static_cast<void>(indexes), *it++)...},
35+
std::move(it)
36+
};
2837
}
2938

3039
} // namespace containers

source/containers/algorithms/sort/fixed_size_merge_sort.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,7 @@
55

66
export module containers.algorithms.sort.fixed_size_merge_sort;
77

8-
import containers.algorithms.sort.sort_exactly_1;
9-
import containers.algorithms.sort.sort_exactly_2;
10-
import containers.algorithms.sort.sort_exactly_3;
11-
import containers.algorithms.sort.sort_exactly_4;
12-
import containers.algorithms.sort.sort_exactly_5;
8+
import containers.algorithms.sort.sort_exactly_n;
139

1410
import containers.algorithms.uninitialized;
1511

@@ -22,7 +18,7 @@ import std_module;
2218
using namespace bounded::literal;
2319
namespace containers {
2420

25-
export constexpr auto fixed_size_merge_sort(auto it, auto const size1, auto const size2, auto const compare) {
21+
export constexpr auto fixed_size_merge_sort(auto it, auto const size1, auto const size2, auto const compare) -> void {
2622
static_assert(size1 <= size2);
2723
static_assert(size1 > 0_bi);
2824
static_assert(size2 > 0_bi);
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright David Stone 2023.
2+
// Distributed under the Boost Software License, Version 1.0.
3+
// (See accompanying file LICENSE_1_0.txt or copy at
4+
// http://www.boost.org/LICENSE_1_0.txt)
5+
6+
export module containers.algorithms.sort.low_high_ref;
7+
8+
namespace containers {
9+
10+
export template<typename T>
11+
struct low_high_ref {
12+
constexpr low_high_ref(T & low_, T & high_):
13+
low(low_),
14+
high(high_)
15+
{
16+
}
17+
18+
T & low;
19+
T & high;
20+
};
21+
22+
} // namespace containers

source/containers/algorithms/sort/small_size_optimized_sort.cpp

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,7 @@ module;
1010
export module containers.algorithms.sort.small_size_optimized_sort;
1111

1212
import containers.algorithms.sort.is_sorted;
13-
import containers.algorithms.sort.sort_exactly_1;
14-
import containers.algorithms.sort.sort_exactly_2;
15-
import containers.algorithms.sort.sort_exactly_3;
16-
import containers.algorithms.sort.sort_exactly_4;
17-
import containers.algorithms.sort.sort_exactly_5;
18-
import containers.algorithms.sort.sort_exactly_6;
13+
import containers.algorithms.sort.sort_exactly_n;
1914

2015
import containers.begin_end;
2116
import containers.range;
@@ -30,13 +25,12 @@ namespace containers {
3025

3126
using namespace bounded::literal;
3227

33-
export constexpr auto max_small_sort_size = 6_bi;
34-
3528
export template<range Range>
3629
inline constexpr auto small_size_optimized_sort(Range && r, auto const compare, auto const sort_large_range) -> void {
30+
constexpr auto min_size = numeric_traits::min_value<range_size_t<Range>>;
3731
constexpr auto max_size = numeric_traits::max_value<range_size_t<Range>>;
3832
auto do_sort = [&](auto const count) {
39-
if constexpr (count <= max_size) {
33+
if constexpr (min_size <= count and count <= max_size) {
4034
::containers::sort_exactly_n(containers::begin(r), count, compare);
4135
BOUNDED_ASSERT(::containers::is_sorted(r, compare));
4236
} else {
@@ -59,11 +53,8 @@ inline constexpr auto small_size_optimized_sort(Range && r, auto const compare,
5953
case 5:
6054
do_sort(5_bi);
6155
return;
62-
case 6:
63-
do_sort(6_bi);
64-
return;
6556
default:
66-
if constexpr (max_size >= max_small_sort_size) {
57+
if constexpr (max_size > 5_bi) {
6758
sort_large_range(r, compare);
6859
return;
6960
} else {

source/containers/algorithms/sort/sort.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import containers.algorithms.advance;
1818
import containers.algorithms.partition;
1919

2020
import containers.begin_end;
21+
import containers.iter_value_t;
2122
import containers.legacy_iterator;
2223
import containers.range;
2324
import containers.range_size_t;
@@ -54,12 +55,14 @@ constexpr auto introsort_impl(Depth const depth) {
5455
auto median = first + length / 2_bi;
5556
auto const before_last = containers::prev(last);
5657
auto median_of = [&](auto... its) {
57-
::containers::sort_exactly_n_impl(*its..., compare);
58+
::containers::sort_exactly_n_in_place_impl(
59+
*its...,
60+
compare
61+
);
5862
};
5963
if (length >= 1000_bi) {
6064
median_of(first, first + length / 4_bi, median, median + length / 4_bi, before_last);
6165
} else {
62-
static_assert(max_small_sort_size >= 3_bi);
6366
median_of(first, median, before_last);
6467
}
6568
median = iterator_partition(first, median, last, compare);

source/containers/algorithms/sort/sort_exactly_1.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,31 @@
55

66
export module containers.algorithms.sort.sort_exactly_1;
77

8+
import containers.iterator;
9+
810
import bounded;
11+
import std_module;
12+
13+
using namespace bounded::literal;
914

1015
namespace containers {
1116

12-
export constexpr auto sort_exactly_n([[maybe_unused]] auto const it, bounded::constant_t<1>, [[maybe_unused]] auto const compare) -> void {
17+
// Stable
18+
// Average compares: 0 (weight 0)
19+
// Max compares: 0
20+
// Sorted compares: 0
21+
// Reversed compares: 0
22+
export template<iterator Iterator, typename Compare>
23+
constexpr auto sort_exactly_n_in_place(Iterator, bounded::constant_t<1>, Compare) -> void {
1324
}
1425

15-
export constexpr auto sort_exactly_n_relocate(auto it, bounded::constant_t<1>, auto const out, [[maybe_unused]] auto const compare) {
16-
bounded::relocate_at(*out, *it);
17-
++it;
18-
return it;
26+
export template<typename T, typename Compare>
27+
constexpr auto sort_exactly_n_relocate_impl(
28+
T & x0,
29+
auto const out,
30+
Compare
31+
) -> void {
32+
bounded::relocate_at(*out, x0);
1933
}
2034

2135
} // namespace containers

source/containers/algorithms/sort/sort_exactly_2.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,43 @@
55

66
export module containers.algorithms.sort.sort_exactly_2;
77

8-
import containers.algorithms.sort.dereference_all;
98
import containers.algorithms.sort.relocate_in_order;
109

1110
import bounded;
1211
import std_module;
1312

13+
using namespace bounded::literal;
14+
1415
namespace containers {
1516

16-
// stable, 1 compare (average 1), 0-3 relocates (average 1.5)
17-
export constexpr auto sort_exactly_n_impl(auto & x0, auto & x1, auto const compare) -> void {
17+
// Stable
18+
// Average compares: 1 (weight 1)
19+
// Max compares: 1
20+
// Sorted compares: 1
21+
// Reversed compares: 1
22+
export template<typename T>
23+
constexpr auto sort_exactly_n_in_place_impl(
24+
T & x0,
25+
T & x1,
26+
auto const compare
27+
) -> void {
1828
if (compare(x1, x0)) {
1929
std::ranges::swap(x0, x1);
2030
}
2131
}
2232

23-
export constexpr auto sort_exactly_n(auto it, bounded::constant_t<2> const size, auto const compare) -> void {
24-
auto [x0, x1, last] = ::containers::dereference_all(std::move(it), size);
25-
sort_exactly_n_impl(x0, x1, compare);
26-
}
27-
28-
export constexpr auto sort_exactly_n_relocate(auto it, bounded::constant_t<2> const size, auto const out, auto const compare) {
29-
auto [x0, x1, last] = ::containers::dereference_all(std::move(it), size);
33+
export template<typename T>
34+
constexpr auto sort_exactly_n_relocate_impl(
35+
T & x0,
36+
T & x1,
37+
auto const out,
38+
auto const compare
39+
) -> void {
3040
if (compare(x1, x0)) {
31-
relocate_in_order(out, x1, x0);
41+
::containers::relocate_in_order(out, x1, x0);
3242
} else {
33-
relocate_in_order(out, x0, x1);
43+
::containers::relocate_in_order(out, x0, x1);
3444
}
35-
return last;
3645
}
3746

3847
} // namespace containers

0 commit comments

Comments
 (0)