Skip to content

Commit 632d4b6

Browse files
Adding chromosome-based file splitting
Signed-off-by: AdityaPandeyCN <[email protected]> clang format Signed-off-by: AdityaPandeyCN <[email protected]> code organization Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]> Add region query benchmark (compiler-research#8) * query performance Signed-off-by: AdityaPandeyCN <[email protected]> * clang format Signed-off-by: AdityaPandeyCN <[email protected]> * code organization Signed-off-by: AdityaPandeyCN <[email protected]> * clang changes Signed-off-by: AdityaPandeyCN <[email protected]> --------- Signed-off-by: AdityaPandeyCN <[email protected]> add chromosome based file splitting Signed-off-by: AdityaPandeyCN <[email protected]> delete example sam file Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]> test file changes Signed-off-by: AdityaPandeyCN <[email protected]> clang changes Signed-off-by: AdityaPandeyCN <[email protected]>
1 parent 30c92b5 commit 632d4b6

File tree

9 files changed

+338
-41
lines changed

9 files changed

+338
-41
lines changed

benchmark/CMakeLists.txt

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,19 @@ target_include_directories(sam_generator PUBLIC
99
${CMAKE_CURRENT_SOURCE_DIR}
1010
)
1111

12-
target_link_libraries(sam_generator PRIVATE benchmark::benchmark)
13-
add_dependencies(sam_generator benchmark::benchmark)
12+
target_link_libraries(sam_generator PRIVATE
13+
benchmark::benchmark
14+
)
15+
16+
add_library(ramtools_views STATIC
17+
${CMAKE_SOURCE_DIR}/tools/ramview.cxx
18+
${CMAKE_SOURCE_DIR}/tools/ramntupleview.cxx
19+
)
20+
21+
target_link_libraries(ramtools_views PRIVATE
22+
ramcore
23+
ROOT::TreePlayer
24+
)
1425

1526
ROOT_EXECUTABLE(sam_to_ram_benchmark
1627
sam_to_ram_benchmark.cxx
@@ -28,14 +39,30 @@ ROOT_EXECUTABLE(conversion_time_benchmark
2839
sam_generator
2940
)
3041

31-
install(TARGETS sam_to_ram_benchmark conversion_time_benchmark
42+
ROOT_EXECUTABLE(region_query_benchmark
43+
region_query_benchmark.cxx
44+
LIBRARIES
45+
benchmark::benchmark
46+
ramcore
47+
sam_generator
48+
ramtools_views
49+
)
50+
51+
install(TARGETS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark
3252
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
3353
)
3454

3555
add_custom_target(benchmark
56+
COMMAND ${CMAKE_COMMAND} -E echo "=== SAM to RAM Benchmark ==="
3657
COMMAND sam_to_ram_benchmark
58+
COMMAND ${CMAKE_COMMAND} -E echo ""
59+
COMMAND ${CMAKE_COMMAND} -E echo "=== Conversion Time Benchmark ==="
3760
COMMAND conversion_time_benchmark
38-
DEPENDS sam_to_ram_benchmark conversion_time_benchmark
61+
COMMAND ${CMAKE_COMMAND} -E echo ""
62+
COMMAND ${CMAKE_COMMAND} -E echo "=== Region Query Benchmark ==="
63+
COMMAND region_query_benchmark
64+
DEPENDS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark
3965
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
66+
COMMENT "Running all RAM tools benchmarks"
4067
)
4168

benchmark/benchmark_utils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#pragma once

// NULL_DEVICE names the platform's null device: "NUL" when _WIN32 is defined,
// "/dev/null" otherwise. It is meant to be used as a path for discarding output.
#if defined(_WIN32)
#define NULL_DEVICE "NUL"
#else
#define NULL_DEVICE "/dev/null"
#endif
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#include <benchmark/benchmark.h>
2+
#include "generate_sam_benchmark.h"
3+
#include "benchmark_utils.h"
4+
#include "ramcore/SamToTTree.h"
5+
#include "ramcore/SamToNTuple.h"
6+
#include <string>
7+
#include <cstdio>
8+
9+
// Forward declarations of the two region-query entry points called by the benchmarks below.
// NOTE(review): presumably defined in tools/ramview.cxx and tools/ramntupleview.cxx — confirm
// that these prototypes (including the default arguments) stay in sync with the definitions.
void ramview(const char *file,
             const char *query,
             bool cache = true,
             bool perfstats = false,
             const char *perfstatsfilename = "perf.root");

void ramntupleview(const char *file,
                   const char *query,
                   bool cache = true,
                   bool perfstats = false,
                   const char *perfstatsfilename = "perf.root");
13+
14+
class RegionQueryFixture : public benchmark::Fixture {
15+
public:
16+
void SetUp(const benchmark::State &state) override
17+
{
18+
num_reads_ = static_cast<int>(state.range(0));
19+
sam_file_ = "region_query_test_" + std::to_string(num_reads_) + ".sam";
20+
21+
GenerateSAMFile(sam_file_, num_reads_);
22+
}
23+
24+
void TearDown(const benchmark::State &) override { std::remove(sam_file_.c_str()); }
25+
26+
protected:
27+
int num_reads_;
28+
std::string sam_file_;
29+
static constexpr const char *region_ = "chr1:1-100000000";
30+
31+
void suppress_output() { freopen(NULL_DEVICE, "w", stdout); }
32+
33+
void restore_output() { freopen("/dev/tty", "w", stdout); }
34+
};
35+
36+
/// Benchmarks ramview() on a ROOT file produced by samtoram() from the fixture's SAM file.
/// Only the ramview() call (plus the stdout redirection around it) sits inside the
/// benchmark loop; the conversion and the file removal happen outside of it.
BENCHMARK_DEFINE_F(RegionQueryFixture, TTree)(benchmark::State &state)
{
   const std::string root_file = "ttree_" + std::to_string(num_reads_) + ".root";

   // One-off conversion before the benchmark loop starts.
   suppress_output();
   samtoram(sam_file_.c_str(), root_file.c_str(), true, true, true, 1, 0);
   restore_output();

   for (auto _ : state) {
      suppress_output();
      ramview(root_file.c_str(), region_, true, false, "perf.root");
      restore_output();
   }

   std::remove(root_file.c_str());

   // num_reads_ is a per-iteration quantity, so the counter must be scaled by the number
   // of iterations before being divided by the elapsed time. Plain kIsRate would divide
   // a single iteration's worth of reads by the time of all iterations.
   state.counters["reads_per_sec"] =
      benchmark::Counter(num_reads_, benchmark::Counter::kIsIterationInvariantRate);
}
54+
55+
/// Benchmarks ramntupleview() on a ROOT file produced by samtoramntuple() from the
/// fixture's SAM file. Only the ramntupleview() call (plus the stdout redirection around
/// it) sits inside the benchmark loop; the conversion and the file removal happen outside.
BENCHMARK_DEFINE_F(RegionQueryFixture, RNTuple)(benchmark::State &state)
{
   const std::string root_file = "rntuple_" + std::to_string(num_reads_) + ".root";

   // One-off conversion before the benchmark loop starts.
   suppress_output();
   samtoramntuple(sam_file_.c_str(), root_file.c_str(), true, true, true, 505, 0);
   restore_output();

   for (auto _ : state) {
      suppress_output();
      ramntupleview(root_file.c_str(), region_, true, false, "perf.root");
      restore_output();
   }

   std::remove(root_file.c_str());

   // num_reads_ is a per-iteration quantity, so the counter must be scaled by the number
   // of iterations before being divided by the elapsed time. Plain kIsRate would divide
   // a single iteration's worth of reads by the time of all iterations.
   state.counters["reads_per_sec"] =
      benchmark::Counter(num_reads_, benchmark::Counter::kIsIterationInvariantRate);
}
73+
74+
BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)
75+
->Args({1000})
76+
->Args({10000})
77+
->Args({100000})
78+
->Unit(benchmark::kMillisecond);
79+
80+
BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)
81+
->Args({1000})
82+
->Args({10000})
83+
->Args({100000})
84+
->Unit(benchmark::kMillisecond);
85+
86+
BENCHMARK_MAIN();

benchmark/sam_to_ram_benchmark.cxx

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,13 @@
11
#include <benchmark/benchmark.h>
22
#include "generate_sam_benchmark.h"
3+
#include "benchmark_utils.h"
34
#include "ramcore/SamToTTree.h"
45
#include "ramcore/SamToNTuple.h"
56
#include <filesystem>
67
#include <iostream>
78
#include <cstdio>
89
#include <cstring>
910

10-
#ifdef _WIN32
11-
#define NULL_DEVICE "NUL"
12-
#else
13-
#define NULL_DEVICE "/dev/null"
14-
#endif
15-
1611
static void BM_SamToRamComparison(benchmark::State &state)
1712
{
1813
int num_reads = state.range(0);

inc/ramcore/SamToNTuple.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
#ifndef RAMCORE_SAMTONTUPLE_H
22
#define RAMCORE_SAMTONTUPLE_H
33

4-
#include <cstdint>
4+
#include <cstdint>
5+
#include <string>
56

67
void samtoramntuple(const char *datafile,
78
const char *treefile,
89
bool index, bool split, bool cache,
910
int compression_algorithm,
1011
uint32_t quality_policy);
1112

12-
#endif // RAMCORE_SAMTONTUPLE_H
13+
/// Splits the SAM file `datafile` by reference name (RNAME, the third tab-separated
/// field) and converts each group with samtoramntuple() into its own file named
/// "<output_prefix>_<RNAME>.root". Records whose RNAME is "*" are skipped.
/// `compression_algorithm` and `quality_policy` are forwarded to samtoramntuple().
void samtoramntuple_split_by_chromosome(const char *datafile,
                                        const char *output_prefix,
                                        int compression_algorithm,
                                        uint32_t quality_policy);
1315

16+
#endif

src/ramcore/SamToNTuple.cxx

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
#include <TNamed.h>
1111
#include <TFile.h>
1212

13+
#include <cstdio>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>
18+
1319
void samtoramntuple(const char *datafile,
1420
const char *treefile,
1521
bool index, bool split, bool cache,
@@ -120,3 +126,66 @@ void samtoramntuple(const char *datafile,
120126
stopwatch.Print();
121127
}
122128

129+
void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm,
130+
uint32_t quality_policy)
131+
{
132+
std::ifstream input(datafile);
133+
if (!input) {
134+
std::cerr << "Error: Cannot open " << datafile << std::endl;
135+
return;
136+
}
137+
138+
std::vector<std::string> headers;
139+
std::map<std::string, std::unique_ptr<std::ofstream>> chr_files;
140+
std::map<std::string, std::string> chr_temp_filenames;
141+
std::string line;
142+
143+
while (std::getline(input, line)) {
144+
if (line.empty())
145+
continue;
146+
147+
if (line[0] == '@') {
148+
headers.push_back(line);
149+
continue;
150+
}
151+
152+
size_t pos = line.find('\t');
153+
if (pos == std::string::npos)
154+
continue;
155+
pos = line.find('\t', pos + 1);
156+
if (pos == std::string::npos)
157+
continue;
158+
159+
size_t end_pos = line.find('\t', pos + 1);
160+
if (end_pos == std::string::npos)
161+
continue;
162+
163+
std::string rname = line.substr(pos + 1, end_pos - pos - 1);
164+
if (rname == "*")
165+
continue;
166+
167+
if (chr_files.find(rname) == chr_files.end()) {
168+
std::string temp_filename = std::string(output_prefix) + "_" + rname + ".tmp.sam";
169+
chr_temp_filenames[rname] = temp_filename;
170+
chr_files[rname] = std::make_unique<std::ofstream>(temp_filename);
171+
172+
for (const auto &header : headers) {
173+
*(chr_files[rname]) << header << "\n";
174+
}
175+
}
176+
177+
*(chr_files[rname]) << line << "\n";
178+
}
179+
180+
input.close();
181+
for (auto &[chr, file] : chr_files) {
182+
file->close();
183+
}
184+
185+
for (const auto &[chr, temp_filename] : chr_temp_filenames) {
186+
std::string output_filename = std::string(output_prefix) + "_" + chr + ".root";
187+
samtoramntuple(temp_filename.c_str(), output_filename.c_str(), false, false, false, compression_algorithm,
188+
quality_policy);
189+
std::remove(temp_filename.c_str());
190+
}
191+
}

test/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ function(add_ramcore_test test_name)
2424
endfunction()
2525

2626
add_ramcore_test(ramcoretests ramcoretests.cxx)
27+
add_ramcore_test(chromosome_split_test chromosome_split_test.cxx)
2728

28-
install(TARGETS ramcoretests
29+
install(TARGETS ramcoretests chromosome_split_test
2930
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
3031
)
3132

test/chromosome_split_test.cxx

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#include <gtest/gtest.h>
2+
#include "ramcore/SamToNTuple.h"
3+
#include "rntuple/RAMNTupleRecord.h"
4+
#include "generate_sam_benchmark.h"
5+
#include <ROOT/RNTupleReader.hxx>
6+
#include <cstdio>
7+
#include <filesystem>
8+
9+
class ChromosomeSplitTest : public ::testing::Test {
10+
protected:
11+
void SetUp() override
12+
{
13+
if (!std::filesystem::exists("test.sam")) {
14+
GenerateSAMFile("test.sam", 100);
15+
}
16+
CleanupTestFiles();
17+
}
18+
19+
void TearDown() override { CleanupTestFiles(); }
20+
21+
void CleanupTestFiles()
22+
{
23+
std::remove("test_regular.root");
24+
for (const auto &entry : std::filesystem::directory_iterator(".")) {
25+
std::string filename = entry.path().filename().string();
26+
if (filename.find("test_split_") == 0 && filename.find(".root") != std::string::npos) {
27+
std::remove(filename.c_str());
28+
}
29+
}
30+
}
31+
};
32+
33+
// The per-chromosome files together must hold exactly as many entries as a regular,
// unsplit conversion of the same SAM file.
TEST_F(ChromosomeSplitTest, NoDataLoss)
{
   samtoramntuple("test.sam", "test_regular.root", false, true, true, 505, 1);
   auto regularReader = ROOT::Experimental::RNTupleReader::Open("RAM", "test_regular.root");
   ASSERT_NE(regularReader, nullptr);
   // Keep the reader's own entry-count type to avoid mixing signed and unsigned values.
   auto totalEntries = regularReader->GetNEntries();

   samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1);

   decltype(totalEntries) splitEntriesSum = 0;
   int filesChecked = 0;
   for (const auto &entry : std::filesystem::directory_iterator(".")) {
      const std::string filename = entry.path().filename().string();
      if (filename.find("test_split_") != 0 || filename.find(".root") == std::string::npos)
         continue;

      auto reader = ROOT::Experimental::RNTupleReader::Open("RAM", filename);
      ASSERT_NE(reader, nullptr) << "Cannot open " << filename;
      splitEntriesSum += reader->GetNEntries();
      ++filesChecked;
   }

   // Guard against comparing two empty results: the split must have produced files.
   EXPECT_GT(filesChecked, 0);
   EXPECT_EQ(totalEntries, splitEntriesSum);
}
54+
55+
// Every record stored in "test_split_<chr>.root" must report <chr> as its RNAME.
TEST_F(ChromosomeSplitTest, CorrectChromosomeAssignment)
{
   samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1);

   const std::string prefix = "test_split_";
   int filesChecked = 0;

   for (const auto &entry : std::filesystem::directory_iterator(".")) {
      const std::string filename = entry.path().filename().string();
      const size_t suffixPos = filename.find(".root");
      if (filename.find(prefix) != 0 || suffixPos == std::string::npos)
         continue;

      // The chromosome name is whatever sits between the prefix and ".root".
      const std::string expectedChr = filename.substr(prefix.size(), suffixPos - prefix.size());

      auto reader = ROOT::Experimental::RNTupleReader::Open("RAM", filename);
      ASSERT_NE(reader, nullptr);

      auto viewRecord = reader->GetView<RAMNTupleRecord>("record");

      for (auto i : reader->GetEntryRange()) {
         const auto &record = viewRecord(i);
         std::string actualChr = record.GetRNAME();
         EXPECT_EQ(expectedChr, actualChr) << "Wrong chromosome in " << filename << " at entry " << i;
      }

      ++filesChecked;
   }

   // Without this the test would pass vacuously if the split produced no files at all.
   EXPECT_GT(filesChecked, 0);
}
79+
80+
// Each split output file must contain a non-empty "METADATA" ntuple, and at least one
// split output file must exist.
TEST_F(ChromosomeSplitTest, MetadataPresent)
{
   samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1);

   int checkedCount = 0;
   for (const auto &dirEntry : std::filesystem::directory_iterator(".")) {
      const std::string name = dirEntry.path().filename().string();
      const bool isSplitOutput = name.find("test_split_") == 0 && name.find(".root") != std::string::npos;
      if (!isSplitOutput)
         continue;

      auto metaReader = ROOT::Experimental::RNTupleReader::Open("METADATA", name);
      EXPECT_NE(metaReader, nullptr) << "Missing METADATA in " << name;

      if (metaReader) {
         EXPECT_GT(metaReader->GetNEntries(), 0);
      }

      ++checkedCount;
   }

   EXPECT_GT(checkedCount, 0);
}

0 commit comments

Comments
 (0)