Skip to content

Commit 80d10b0

Browse files
Add Chromosome based file splitting (#9)
* Adding chromosome base file splitting Signed-off-by: AdityaPandeyCN <[email protected]> clang format Signed-off-by: AdityaPandeyCN <[email protected]> code organization Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]> Add region query benchmark (#8) * query performance Signed-off-by: AdityaPandeyCN <[email protected]> * clang format Signed-off-by: AdityaPandeyCN <[email protected]> * code organization Signed-off-by: AdityaPandeyCN <[email protected]> * clang changes Signed-off-by: AdityaPandeyCN <[email protected]> --------- Signed-off-by: AdityaPandeyCN <[email protected]> add chromosome based file splitting Signed-off-by: AdityaPandeyCN <[email protected]> delete example sam file Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]> test file changes Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]> * added bam vs ram file benchmark for chromosome based file splitting Signed-off-by: AdityaPandeyCN <[email protected]> * clang format changes Signed-off-by: AdityaPandeyCN <[email protected]> * made benchmarks fair Signed-off-by: AdityaPandeyCN <[email protected]> * clang changes Signed-off-by: AdityaPandeyCN <[email protected]> * parallel write code Signed-off-by: AdityaPandeyCN <[email protected]> * version update Signed-off-by: AdityaPandeyCN <[email protected]> * version update Signed-off-by: AdityaPandeyCN <[email protected]> * use parallel writer Signed-off-by: AdityaPandeyCN <[email protected]> * clang changes Signed-off-by: AdityaPandeyCN <[email protected]> * clang changes Signed-off-by: AdityaPandeyCN <[email protected]> * samtools threading Signed-off-by: AdityaPandeyCN <[email protected]> * clang format changes Signed-off-by: AdityaPandeyCN <[email protected]> --------- Signed-off-by: AdityaPandeyCN <[email protected]>
1 parent 252c3ba commit 80d10b0

File tree

11 files changed

+544
-79
lines changed

11 files changed

+544
-79
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616

1717
- name: Install ROOT
1818
run: |
19-
ROOT_URL="https://root.cern/download/root_v6.34.06.Linux-ubuntu24.04-x86_64-gcc13.3.tar.gz"
19+
ROOT_URL="https://root.cern/download/root_v6.36.00.Linux-ubuntu24.04-x86_64-gcc13.3.tar.gz"
2020
wget -O root.tar.gz $ROOT_URL
2121
tar -xzf root.tar.gz -C /opt/
2222
echo "/opt/root/bin" >> $GITHUB_PATH

benchmark/CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,15 @@ ROOT_EXECUTABLE(region_query_benchmark
4848
ramtools_views
4949
)
5050

51-
install(TARGETS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark
51+
ROOT_EXECUTABLE(chromosome_split_benchmark
52+
chromosome_split_benchmark.cxx
53+
LIBRARIES
54+
benchmark::benchmark
55+
ramcore
56+
sam_generator
57+
)
58+
59+
install(TARGETS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark chromosome_split_benchmark
5260
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
5361
)
5462

@@ -61,7 +69,10 @@ add_custom_target(benchmark
6169
COMMAND ${CMAKE_COMMAND} -E echo ""
6270
COMMAND ${CMAKE_COMMAND} -E echo "=== Region Query Benchmark ==="
6371
COMMAND region_query_benchmark
64-
DEPENDS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark
72+
COMMAND ${CMAKE_COMMAND} -E echo ""
73+
COMMAND ${CMAKE_COMMAND} -E echo "=== Chromosome Split Benchmark ==="
74+
COMMAND chromosome_split_benchmark
75+
DEPENDS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark chromosome_split_benchmark
6576
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
6677
COMMENT "Running all RAM tools benchmarks"
6778
)
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#include <benchmark/benchmark.h>
#include "ramcore/SamToNTuple.h"
#include "generate_sam_benchmark.h"
#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <string>
#include <system_error>
#include <thread>
#include <vector>
11+
12+
/// Removes every entry of the current working directory whose file name
/// contains `pattern` as a substring.
///
/// Cleanup is best-effort: listing or removal failures are ignored and the
/// function never throws because of them.
///
/// @param pattern Substring searched for in each entry's file name. An empty
///                pattern is rejected (nothing is removed), because it would
///                otherwise match every entry of the directory.
static void CleanupFiles(const std::string &pattern)
{
   if (pattern.empty()) {
      return;
   }

   // Collect the matches first so the directory is not modified while it is
   // still being iterated.
   std::error_code ec;
   std::vector<std::filesystem::path> matches;
   for (std::filesystem::directory_iterator it(".", ec), last; !ec && it != last; it.increment(ec)) {
      if (it->path().filename().string().find(pattern) != std::string::npos) {
         matches.push_back(it->path());
      }
   }

   for (const auto &path : matches) {
      // The error_code overload keeps this portable (no narrow c_str() needed)
      // and non-throwing; failures are deliberately ignored.
      std::filesystem::remove(path, ec);
   }
}
20+
21+
/// Sums the sizes, in bytes, of all entries of the current working directory
/// whose file name contains `pattern` as a substring.
///
/// Entries whose size cannot be queried (for example a matching directory, or
/// a file that disappears between listing and querying) are skipped instead of
/// aborting the whole computation with an exception.
///
/// @param pattern Substring searched for in each entry's file name.
/// @return Total size in bytes of the matching entries that could be measured;
///         0 if nothing matched or the directory could not be listed.
static size_t GetTotalFileSize(const std::string &pattern)
{
   size_t total = 0;
   std::error_code list_ec;
   for (std::filesystem::directory_iterator it(".", list_ec), last; !list_ec && it != last; it.increment(list_ec)) {
      if (it->path().filename().string().find(pattern) == std::string::npos) {
         continue;
      }
      std::error_code size_ec;
      const auto bytes = std::filesystem::file_size(it->path(), size_ec);
      if (!size_ec) {
         total += static_cast<size_t>(bytes);
      }
   }
   return total;
}
31+
32+
/// Extracts the reference sequence names from the header of a SAM file.
///
/// Reads lines from the start of the file until the first line that does not
/// begin with '@'. For every `@SQ` line the value of its `SN:` field is
/// collected, wherever that field appears among the tab-separated fields of
/// the line. A trailing carriage return is stripped, and `@SQ` lines without
/// an `SN:` field or with an empty name are skipped.
///
/// @param sam_file Path of the SAM file to read.
/// @return Sequence names in header order; empty if the file cannot be opened
///         or contains no usable `@SQ` line.
static std::vector<std::string> GetChromosomes(const std::string &sam_file)
{
   std::vector<std::string> chroms;
   std::ifstream f(sam_file);
   std::string line;

   while (std::getline(f, line) && !line.empty() && line[0] == '@') {
      // Tolerate CRLF line endings so '\r' never ends up in a name.
      if (line.back() == '\r') {
         line.pop_back();
      }
      // Prefix comparison instead of find(): no scan of the whole line.
      if (line.compare(0, 4, "@SQ\t") != 0) {
         continue;
      }
      // Start at the tab that closes "@SQ" so SN is found in any field position.
      const std::size_t field = line.find("\tSN:", 3);
      if (field == std::string::npos) {
         continue;
      }
      const std::size_t start = field + 4;
      const std::size_t tab = line.find('\t', start);
      const std::size_t stop = (tab == std::string::npos) ? line.size() : tab;
      if (stop > start) {
         chroms.push_back(line.substr(start, stop - start));
      }
   }
   return chroms;
}
47+
48+
/// Benchmarks splitting a SAM file into per-chromosome BAM files with
/// single-threaded samtools.
///
/// A SAM file with `state.range(0)` reads is generated once, outside the timed
/// loop. Each timed iteration converts it to BAM, sorts and indexes the BAM,
/// and then runs one `samtools view -b` per sequence name found in the SAM
/// header. If any samtools command exits with a non-zero status the benchmark
/// is reported as failed instead of producing a misleading timing.
///
/// Counters: "size_MB" (combined size of the per-chromosome outputs of the
/// last iteration) and "reads/s" (reads processed per second).
static void BM_SamtoolsSplit(benchmark::State &state)
{
   int num_reads = static_cast<int>(state.range(0));
   std::string sam_file = "bench_st_" + std::to_string(num_reads) + ".sam";

   GenerateSAMFile(sam_file, num_reads);
   const auto chromosomes = GetChromosomes(sam_file);

   const std::string bam_file = "bench_st_tmp.bam";
   const std::string sorted_bam = "bench_st_sorted.bam";

   // A command counts as successful only when std::system() returns 0.
   const auto run = [](const std::string &command) { return std::system(command.c_str()) == 0; };

   for (auto _ : state) {
      bool ok = run("samtools view -bS " + sam_file + " -o " + bam_file + " 2>/dev/null") &&
                run("samtools sort " + bam_file + " -o " + sorted_bam + " 2>/dev/null") &&
                run("samtools index " + sorted_bam + " 2>/dev/null");

      for (const auto &chr : chromosomes) {
         if (!ok) {
            break;
         }
         ok = run("samtools view -b " + sorted_bam + " " + chr + " > bench_st_" + chr + ".bam 2>/dev/null");
      }

      // NOTE: the "bench_st_chr" pattern only covers outputs of sequences
      // whose names start with "chr".
      if (ok) {
         state.counters["size_MB"] = GetTotalFileSize("bench_st_chr") / (1024.0 * 1024.0);
      }

      CleanupFiles("bench_st_chr");
      std::remove(bam_file.c_str());
      std::remove(sorted_bam.c_str());
      std::remove((sorted_bam + ".bai").c_str());

      if (!ok) {
         state.SkipWithError("samtools command failed (is samtools installed and on PATH?)");
         break;
      }
   }

   std::remove(sam_file.c_str());
   // num_reads is the amount of work per iteration, so the rate must be scaled
   // by the iteration count; plain kIsRate would divide a single iteration's
   // reads by the time of all iterations.
   state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsIterationInvariantRate);
}
82+
83+
/// Benchmarks splitting a SAM file into per-chromosome BAM files with
/// multi-threaded samtools.
///
/// `state.range(0)` is the number of reads in the generated SAM file and
/// `state.range(1)` the thread count passed to `samtools view/sort/index -@`.
/// The per-chromosome extraction runs at most `state.range(1)` samtools
/// processes at a time (each started with `-@ 2`), in batches that are joined
/// before the next batch starts. If any samtools command exits with a non-zero
/// status the benchmark is reported as failed.
///
/// Counters: "size_MB", "threads" and "reads/s" (reads processed per second).
static void BM_SamtoolsSplitThreaded(benchmark::State &state)
{
   int num_reads = static_cast<int>(state.range(0));
   int num_threads = static_cast<int>(state.range(1));
   std::string sam_file = "bench_st_mt_" + std::to_string(num_reads) + ".sam";

   GenerateSAMFile(sam_file, num_reads);
   const auto chromosomes = GetChromosomes(sam_file);

   const std::string bam_file = "bench_st_mt_tmp.bam";
   const std::string sorted_bam = "bench_st_mt_sorted.bam";
   const std::string thread_opt = "-@ " + std::to_string(num_threads) + " ";
   // Unsigned batch size of at least one; avoids the signed/unsigned comparison
   // against threads.size().
   const std::size_t batch_size = num_threads > 0 ? static_cast<std::size_t>(num_threads) : 1;

   // A command counts as successful only when std::system() returns 0.
   const auto run = [](const std::string &command) { return std::system(command.c_str()) == 0; };

   for (auto _ : state) {
      bool ok = run("samtools view " + thread_opt + "-bS " + sam_file + " -o " + bam_file + " 2>/dev/null") &&
                run("samtools sort " + thread_opt + "-m 1G " + bam_file + " -o " + sorted_bam + " 2>/dev/null") &&
                run("samtools index " + thread_opt + sorted_bam + " 2>/dev/null");

      if (ok) {
         // One pre-sized result slot per chromosome: every worker writes only
         // its own element, and the slots are read after the workers are joined.
         std::vector<char> extracted(chromosomes.size(), 0);
         std::vector<std::thread> threads;
         threads.reserve(batch_size);

         const auto join_all = [&threads]() {
            for (auto &t : threads) {
               t.join();
            }
            threads.clear();
         };

         for (std::size_t i = 0; i < chromosomes.size(); ++i) {
            threads.emplace_back([&run, &sorted_bam, &chromosomes, &extracted, i]() {
               const std::string &chr = chromosomes[i];
               extracted[i] = run("samtools view -@ 2 -b " + sorted_bam + " " + chr + " > bench_st_mt_" + chr +
                                  ".bam 2>/dev/null");
            });

            if (threads.size() >= batch_size) {
               join_all();
            }
         }
         join_all();

         for (const char done : extracted) {
            ok = ok && done != 0;
         }
      }

      // NOTE: the "bench_st_mt_chr" pattern only covers outputs of sequences
      // whose names start with "chr".
      if (ok) {
         state.counters["size_MB"] = GetTotalFileSize("bench_st_mt_chr") / (1024.0 * 1024.0);
      }
      state.counters["threads"] = num_threads;

      CleanupFiles("bench_st_mt_chr");
      std::remove(bam_file.c_str());
      std::remove(sorted_bam.c_str());
      std::remove((sorted_bam + ".bai").c_str());

      if (!ok) {
         state.SkipWithError("samtools command failed (is samtools installed and on PATH?)");
         break;
      }
   }

   std::remove(sam_file.c_str());
   // num_reads is the amount of work per iteration, so the rate must be scaled
   // by the iteration count; plain kIsRate would divide a single iteration's
   // reads by the time of all iterations.
   state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsIterationInvariantRate);
}
139+
140+
/// Benchmarks `samtoramntuple_split_by_chromosome` on a generated SAM file.
///
/// `state.range(0)` is the number of reads in the generated SAM file and
/// `state.range(1)` the thread count handed to the converter. While the
/// converter runs, `stdout` and `stderr` are pointed at /dev/null so that
/// anything it prints through C stdio does not clutter the benchmark report.
///
/// Counters: "size_MB" (combined size of the files whose names contain
/// "bench_split_par_out_"), "threads" and "reads/s".
static void BM_ChromosomeSplitThreads(benchmark::State &state)
{
   int num_reads = static_cast<int>(state.range(0));
   int num_threads = static_cast<int>(state.range(1));
   std::string sam_file = "bench_split_par_" + std::to_string(num_reads) + ".sam";

   GenerateSAMFile(sam_file, num_reads);

   // NOTE(review): assigning to stdout/stderr is not guaranteed by the C
   // standard (they need not be modifiable lvalues); this relies on the
   // platform's C library allowing it. It presumably does not affect output
   // written through std::cout/std::cerr - confirm if the converter uses them.
   FILE *const old_stdout = stdout;
   FILE *const old_stderr = stderr;

   // Opened once, outside the timed loop. A failed open leaves the matching
   // stream untouched instead of installing a null FILE*.
   FILE *const null_out = std::fopen("/dev/null", "w");
   FILE *const null_err = std::fopen("/dev/null", "w");

   for (auto _ : state) {
      if (null_out != nullptr) {
         stdout = null_out;
      }
      if (null_err != nullptr) {
         stderr = null_err;
      }

      // Arguments after the output prefix: compression_algorithm = 505,
      // quality_policy = 1, then the thread count.
      samtoramntuple_split_by_chromosome(sam_file.c_str(), "bench_split_par_out", 505, 1, num_threads);

      stdout = old_stdout;
      stderr = old_stderr;

      state.counters["size_MB"] = GetTotalFileSize("bench_split_par_out_") / (1024.0 * 1024.0);
      state.counters["threads"] = num_threads;
      CleanupFiles("bench_split_par_out_");
   }

   if (null_out != nullptr) {
      std::fclose(null_out);
   }
   if (null_err != nullptr) {
      std::fclose(null_err);
   }

   std::remove(sam_file.c_str());
   // num_reads is the amount of work per iteration, so the rate must be scaled
   // by the iteration count; plain kIsRate would divide a single iteration's
   // reads by the time of all iterations.
   state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsIterationInvariantRate);
}
170+
171+
// Single-threaded samtools baseline: one run per read count (100k, 500k, 1M),
// reported in milliseconds.
BENCHMARK(BM_SamtoolsSplit)->Arg(100000)->Arg(500000)->Arg(1000000)->Unit(benchmark::kMillisecond);
172+
173+
// Multi-threaded samtools: each Args pair is {number of reads, thread count}.
BENCHMARK(BM_SamtoolsSplitThreaded)
174+
->Args({100000, 2})
175+
->Args({100000, 4})
176+
->Args({500000, 2})
177+
->Args({500000, 4})
178+
->Args({1000000, 2})
179+
->Args({1000000, 4})
180+
->Unit(benchmark::kMillisecond);
181+
182+
// RAM chromosome split: same {number of reads, thread count} grid as the
// threaded samtools benchmark above, so the two can be compared row by row.
BENCHMARK(BM_ChromosomeSplitThreads)
183+
->Args({100000, 2})
184+
->Args({100000, 4})
185+
->Args({500000, 2})
186+
->Args({500000, 4})
187+
->Args({1000000, 2})
188+
->Args({1000000, 4})
189+
->Unit(benchmark::kMillisecond);
190+
191+
// Expands to the main() that runs all benchmarks registered above.
BENCHMARK_MAIN();

inc/ramcore/SamToNTuple.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
#ifndef RAMCORE_SAMTONTUPLE_H
22
#define RAMCORE_SAMTONTUPLE_H
33

4-
#include <cstdint>
4+
#include <cstdint>
5+
#include <string>
56

67
void samtoramntuple(const char *datafile,
78
const char *treefile,
89
bool index, bool split, bool cache,
910
int compression_algorithm,
1011
uint32_t quality_policy);
1112

12-
#endif // RAMCORE_SAMTONTUPLE_H
13+
void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm,
14+
uint32_t quality_policy, int num_threads = 4);
1315

16+
#endif

inc/rntuple/RAMNTupleRecord.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@
1818
#include <memory>
1919
#include <cstdint>
2020

21-
namespace ROOT::Experimental {
21+
namespace ROOT {
2222
class RNTupleModel;
2323
class RNTupleWriter;
2424
class RNTupleReader;
25-
} // namespace ROOT::Experimental
25+
} // namespace ROOT
2626

2727
class RAMNTupleRefs;
2828
class RAMNTupleIndex;
@@ -222,15 +222,15 @@ class RAMNTupleRecord {
222222
static RAMNTupleIndex *GetIndex() { return fgIndex.get(); }
223223

224224
// File I/O
225-
static std::unique_ptr<ROOT::Experimental::RNTupleReader>
225+
static std::unique_ptr<ROOT::RNTupleReader>
226226
OpenRAMFile(const std::string &filename, const std::string &ntupleName = "RAM");
227227
static void WriteAllRefs(TFile &file);
228228
static void ReadAllRefs(const std::string &filename = "");
229229
static void WriteIndex(TFile &file);
230230
static void ReadIndex(const std::string &filename = "");
231231

232232
// RNTuple model creation
233-
static std::unique_ptr<ROOT::Experimental::RNTupleModel> MakeModel();
233+
static std::unique_ptr<ROOT::RNTupleModel> MakeModel();
234234

235235
void SetCompressionMode(uint32_t flags) { compression_flags = flags; }
236236

0 commit comments

Comments
 (0)