Skip to content

Conversation

bobhan1
Copy link
Contributor

@bobhan1 bobhan1 commented Sep 29, 2025

pick #56395

[Fix](warmup) Fix coredump in `CloudTablet::complete_rowset_segment_warmup` due to capture by reference (apache#56395)

### What problem does this PR solve?
The coredump was introduced in apache#54611. gdb backtrace of the crash:

```
(gdb) bt
#0  __GI___pthread_sigmask (how=2, newmask=<optimized out>, oldmask=0x0) at ./nptl/pthread_sigmask.c:43
#1  0x00007f7aa6b5d71e in PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] () from /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so
#2  0x00007f7aa6b5e206 in JVM_handle_linux_signal () from /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so
#3  <signal handler called>
#4  0x000055e20050d443 in std::_Hashtable<doris::RowsetId, std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo>, std::allocator<std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo> >, std::__detail::_Select1st, std::equal_to<doris::RowsetId>, std::hash<doris::RowsetId>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_find_before_node (this=0x7f79e69acde0,
    __bkt=4186920012728959759, __k=..., __code=11200260987994981938) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/hashtable.h:2205
#5  std::_Hashtable<doris::RowsetId, std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo>, std::allocator<std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo> >, std::__detail::_Select1st, std::equal_to<doris::RowsetId>, std::hash<doris::RowsetId>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_locate (this=this@entry=0x7f79e69acde0, __k=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/hashtable.h:2283
#6  0x000055e200505a8d in std::_Hashtable<doris::RowsetId, std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo>, std::allocator<std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo> >, std::__detail::_Select1st, std::equal_to<doris::RowsetId>, std::hash<doris::RowsetId>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::find (this=0x7f79e69acde0, __k=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/hashtable.h:1929
#7  std::unordered_map<doris::RowsetId, doris::CloudTablet::RowsetWarmUpInfo, std::hash<doris::RowsetId>, std::equal_to<doris::RowsetId>, std::allocator<std::pair<doris::RowsetId const, doris::CloudTablet::RowsetWarmUpInfo> > >::contains (this=0x7f79e69acde0, __x=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/unordered_map.h:999
#8  doris::CloudTablet::complete_rowset_segment_warmup (this=<optimized out>, rowset_id=..., status=..., segment_num=1, inverted_idx_num=0) at /home/zcp/repo_center/doris_master/doris/be/src/cloud/cloud_tablet.cpp:1692
#9  0x000055e2005338ba in doris::CloudWarmUpManager::handle_jobs()::$_0::operator()(doris::Status) const (this=0x7f79e8bebad0, st=...) at /home/zcp/repo_center/doris_master/doris/be/src/cloud/cloud_warm_up_manager.cpp:243
#10 std::__invoke_impl<void, doris::CloudWarmUpManager::handle_jobs()::$_0&, doris::Status>(std::__invoke_other, doris::CloudWarmUpManager::handle_jobs()::$_0&, doris::Status&&) (__f=..., __args=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/invoke.h:63
#11 std::__invoke_r<void, doris::CloudWarmUpManager::handle_jobs()::$_0&, doris::Status>(doris::CloudWarmUpManager::handle_jobs()::$_0&, doris::Status&&) (__fn=..., __args=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/invoke.h:113
#12 std::_Function_handler<void (doris::Status), doris::CloudWarmUpManager::handle_jobs()::$_0>::_M_invoke(std::_Any_data const&, doris::Status&&) (__functor=..., __args=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:292
#13 0x000055e200533310 in std::function<void (doris::Status)>::operator()(doris::Status) const (this=0x7f795a3ff1d0, __args=...) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:593
#14 doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0::operator()(doris::Status) const (
    this=0x7f79f0710b60, st=...) at /home/zcp/repo_center/doris_master/doris/be/src/cloud/cloud_warm_up_manager.cpp:149
#15 std::__invoke_impl<void, doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0&, doris::Status>(std::__invoke_other, doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0&, doris::Status&&) (
    __f=..., __args=...) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/invoke.h:63
#16 std::__invoke_r<void, doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0&, doris::Status>(doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0&, doris::Status&&) (__fn=..., __args=...)
    at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/invoke.h:113
#17 std::_Function_handler<void (doris::Status), doris::CloudWarmUpManager::submit_download_tasks(std::filesystem::__cxx11::path, long, std::shared_ptr<doris::io::FileSystem>, long, std::shared_ptr<bthread::CountdownEvent>, bool, std::function<void (doris::Status)>)::$_0>::_M_invoke(std::_Any_data const&, doris::Status&&) (__functor=..., __args=...) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:292
#18 0x000055e200469cb1 in std::function<void (doris::Status)>::operator()(doris::Status) const (this=0x7f796941cee0, __args=...) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:593
#19 doris::io::FileCacheBlockDownloader::download_segment_file (this=<optimized out>, meta=...) at /home/zcp/repo_center/doris_master/doris/be/src/io/cache/block_file_cache_downloader.cpp:297
#20 0x000055e1fbfcbe85 in doris::ThreadPool::dispatch_thread (this=0x7f7995a1da00) at /home/zcp/repo_center/doris_master/doris/be/src/util/threadpool.cpp:614
#21 0x000055e1fbfc0fac in std::function<void ()>::operator()() const (this=0x7f796941cee0) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:593
#22 doris::Thread::supervise_thread (arg=0x7f79f13de010) at /home/zcp/repo_center/doris_master/doris/be/src/util/thread.cpp:460
#23 0x00007f7aa5bf8ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#24 0x00007f7aa5c8a850 in __closefrom_fallback (from=1685031872, dirfd_fallback=<optimized out>) at ../sysdeps/unix/sysv/linux/closefrom_fallback.c:45
#25 0x0000000000000000 in ?? ()
```

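Problem Summary: `CloudTablet::complete_rowset_segment_warmup` crashes inside an `unordered_map` lookup when it is called from the completion callback created in `CloudWarmUpManager::handle_jobs()` and run on a download thread-pool thread; per the PR title, the cause is that callback capturing by reference.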

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
        - [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason?  -->

- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [ ] No.
    - [ ] Yes. <!-- Add document PR link here, e.g. apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
@bobhan1 bobhan1 requested a review from morrySnow as a code owner September 29, 2025 06:42
@Thearas
Copy link
Contributor

Thearas commented Sep 29, 2025

Thank you for your contribution to Apache Doris.
Don't know what should be done next? See How to process your PR.

Please clearly describe your PR:

  1. What problem was fixed (it's best to include specific error reporting information). How it was fixed.
  2. Which behaviors were modified. What was the previous behavior, what is it now, why was it modified, and what possible impacts might there be.
  3. What features were added. Why was this function added?
  4. Which code was refactored and why was this part of the code refactored?
  5. Which functions were optimized and what is the difference before and after the optimization?

@bobhan1 bobhan1 changed the title [Fix](warmup) Fix coredump in CloudTablet::complete_rowset_segment_warmup due to capture by reference (#56395) branch-3.1: [Fix](warmup) Fix coredump in CloudTablet::complete_rowset_segment_warmup due to capture by reference (#56395) Sep 29, 2025
@bobhan1
Copy link
Contributor Author

bobhan1 commented Sep 29, 2025

run buildall

@doris-robot
Copy link

TPC-H: Total hot run time: 32836 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpch-tools
Tpch sf100 test result on commit aecbc99b48e31bae255f57c1f128f42dbe29dc66, data reload: false

------ Round 1 ----------------------------------
q1	17771	5505	5491	5491
q2	2030	412	275	275
q3	12447	1259	745	745
q4	10283	895	456	456
q5	8834	2408	2157	2157
q6	185	166	134	134
q7	899	752	606	606
q8	9330	1476	1176	1176
q9	5254	5025	4960	4960
q10	6777	2301	1810	1810
q11	489	284	267	267
q12	340	359	208	208
q13	17779	3622	3021	3021
q14	236	218	206	206
q15	533	465	479	465
q16	411	433	373	373
q17	599	870	353	353
q18	6994	6518	6455	6455
q19	1343	973	561	561
q20	337	347	205	205
q21	2822	2128	1948	1948
q22	1064	1050	964	964
Total cold run time: 106757 ms
Total hot run time: 32836 ms

----- Round 2, with runtime_filter_mode=off -----
q1	5580	5637	5530	5530
q2	238	337	245	245
q3	2276	2653	2336	2336
q4	1366	1823	1355	1355
q5	4438	5057	5029	5029
q6	172	166	134	134
q7	2071	2001	1804	1804
q8	2674	2793	2743	2743
q9	7297	7272	7266	7266
q10	3044	3249	2714	2714
q11	573	504	491	491
q12	661	777	638	638
q13	3403	3805	3160	3160
q14	277	289	279	279
q15	548	472	473	472
q16	434	476	429	429
q17	1252	1745	1231	1231
q18	7819	7476	7448	7448
q19	850	1139	1115	1115
q20	2041	2044	1907	1907
q21	5329	4929	4426	4426
q22	1063	1114	1054	1054
Total cold run time: 53406 ms
Total hot run time: 51806 ms

@doris-robot
Copy link

TPC-DS: Total hot run time: 192943 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpcds-tools
TPC-DS sf100 test result on commit aecbc99b48e31bae255f57c1f128f42dbe29dc66, data reload: false

query1	975	415	396	396
query2	6226	1978	1852	1852
query3	8697	203	201	201
query4	33710	23824	23552	23552
query5	3668	587	452	452
query6	306	210	186	186
query7	4209	487	317	317
query8	317	258	240	240
query9	9457	2604	2589	2589
query10	463	325	270	270
query11	18465	15418	15311	15311
query12	160	109	108	108
query13	1546	532	394	394
query14	9750	7496	6888	6888
query15	221	185	174	174
query16	7988	651	477	477
query17	1533	794	598	598
query18	2119	434	326	326
query19	211	213	162	162
query20	131	125	129	125
query21	208	129	107	107
query22	4571	4698	4408	4408
query23	35222	34721	34141	34141
query24	7149	2683	2679	2679
query25	508	493	412	412
query26	740	274	165	165
query27	1942	479	362	362
query28	5186	2217	2196	2196
query29	675	591	469	469
query30	238	189	159	159
query31	973	938	816	816
query32	87	59	59	59
query33	489	371	301	301
query34	738	862	523	523
query35	778	817	736	736
query36	1019	1082	942	942
query37	118	91	66	66
query38	3985	4038	3949	3949
query39	1533	1500	1455	1455
query40	203	122	102	102
query41	56	50	47	47
query42	118	108	103	103
query43	518	531	470	470
query44	1326	841	839	839
query45	188	181	170	170
query46	892	1071	674	674
query47	1938	1950	1918	1918
query48	418	446	372	372
query49	730	498	430	430
query50	689	689	434	434
query51	7230	7295	7236	7236
query52	105	100	96	96
query53	234	268	199	199
query54	559	539	477	477
query55	77	76	75	75
query56	295	275	261	261
query57	1257	1249	1245	1245
query58	228	216	233	216
query59	3053	3113	3115	3113
query60	316	288	284	284
query61	117	114	118	114
query62	792	765	693	693
query63	234	193	196	193
query64	3228	1102	740	740
query65	3420	3311	3326	3311
query66	889	433	324	324
query67	16328	15730	15660	15660
query68	4299	837	565	565
query69	481	304	275	275
query70	1180	1135	1163	1135
query71	358	304	275	275
query72	5883	3910	4037	3910
query73	627	751	359	359
query74	10291	9419	9270	9270
query75	3205	3189	2673	2673
query76	2913	1130	760	760
query77	539	363	282	282
query78	10493	10435	9597	9597
query79	2916	902	599	599
query80	698	528	439	439
query81	502	261	224	224
query82	219	118	93	93
query83	166	158	139	139
query84	243	103	80	80
query85	807	384	309	309
query86	403	319	308	308
query87	4266	4375	4229	4229
query88	5148	2385	2372	2372
query89	398	324	295	295
query90	1762	200	191	191
query91	135	146	106	106
query92	65	55	49	49
query93	2582	863	540	540
query94	664	419	302	302
query95	351	284	267	267
query96	484	616	277	277
query97	3225	3293	3160	3160
query98	226	216	206	206
query99	1311	1406	1298	1298
Total cold run time: 288589 ms
Total hot run time: 192943 ms

@doris-robot
Copy link

ClickBench: Total hot run time: 28.43 s
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/clickbench-tools
ClickBench test result on commit aecbc99b48e31bae255f57c1f128f42dbe29dc66, data reload: false

query1	0.04	0.03	0.03
query2	0.07	0.04	0.03
query3	0.24	0.07	0.07
query4	1.62	0.10	0.10
query5	0.52	0.54	0.52
query6	1.12	0.76	0.73
query7	0.02	0.01	0.01
query8	0.04	0.03	0.03
query9	0.57	0.49	0.51
query10	0.56	0.54	0.56
query11	0.14	0.10	0.10
query12	0.16	0.11	0.11
query13	0.61	0.61	0.60
query14	0.77	0.81	0.78
query15	0.84	0.82	0.83
query16	0.39	0.38	0.38
query17	1.09	1.06	1.09
query18	0.24	0.23	0.23
query19	1.94	1.89	1.77
query20	0.01	0.01	0.01
query21	15.38	0.92	0.57
query22	0.74	0.78	0.70
query23	15.08	1.42	0.54
query24	2.83	1.53	0.45
query25	0.12	0.10	0.08
query26	0.42	0.16	0.14
query27	0.04	0.05	0.04
query28	13.02	1.08	0.44
query29	12.57	3.91	3.23
query30	0.25	0.10	0.06
query31	2.80	0.60	0.40
query32	3.22	0.55	0.46
query33	3.01	3.04	3.02
query34	16.21	5.23	4.55
query35	4.62	4.55	4.55
query36	0.64	0.49	0.48
query37	0.09	0.06	0.06
query38	0.04	0.03	0.03
query39	0.03	0.02	0.02
query40	0.16	0.14	0.13
query41	0.08	0.02	0.02
query42	0.03	0.02	0.02
query43	0.04	0.03	0.03
Total cold run time: 102.41 s
Total hot run time: 28.43 s

Copy link
Contributor

@dataroaring dataroaring left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@morrySnow morrySnow changed the title branch-3.1: [Fix](warmup) Fix coredump in CloudTablet::complete_rowset_segment_warmup due to capture by reference (#56395) branch-3.1: [Fix](warmup) Fix coredump in CloudTablet::complete_rowset_segment_warmup due to capture by reference #56395 Oct 10, 2025
@morrySnow morrySnow merged commit 73389f7 into apache:branch-3.1 Oct 10, 2025
22 of 23 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants