Skip to content

Commit 840e33d

Browse files
[Enhancement] Optimize CSV parsing with memchr (backport #63715) (#65973)
Co-authored-by: nancodex <[email protected]>
1 parent 1052b5d commit 840e33d

File tree

3 files changed

+240
-11
lines changed

3 files changed

+240
-11
lines changed

be/src/formats/csv/csv_reader.cpp

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -555,15 +555,41 @@ void CSVReader::split_record(const Record& record, Fields* columns) const {
555555
const size_t size = record.size;
556556

557557
if (_column_delimiter_length == 1) {
558-
for (size_t i = 0; i < size; ++i, ++ptr) {
559-
if (*ptr == _parse_options.column_delimiter[0]) {
558+
// Optimized: use memchr for SIMD-optimized character search
559+
const char delimiter = _parse_options.column_delimiter[0];
560+
const char* end = record.data + size;
561+
562+
// Handle empty string case
563+
if (size == 0) {
564+
columns->emplace_back("", 0);
565+
return;
566+
}
567+
568+
while (ptr <= end) {
569+
const char* next_delimiter = nullptr;
570+
if (ptr < end) {
571+
next_delimiter = static_cast<const char*>(memchr(ptr, delimiter, end - ptr));
572+
}
573+
574+
if (next_delimiter == nullptr) {
575+
// No more delimiters found, add the remaining part
560576
if (_parse_options.trim_space) {
561-
std::pair<const char*, size_t> newPos = trim(value, ptr - value);
577+
std::pair<const char*, size_t> newPos = trim(value, end - value);
562578
columns->emplace_back(newPos.first, newPos.second);
563579
} else {
564-
columns->emplace_back(value, ptr - value);
580+
columns->emplace_back(value, end - value);
565581
}
566-
value = ptr + 1;
582+
break;
583+
} else {
584+
// Found delimiter, add the field
585+
if (_parse_options.trim_space) {
586+
std::pair<const char*, size_t> newPos = trim(value, next_delimiter - value);
587+
columns->emplace_back(newPos.first, newPos.second);
588+
} else {
589+
columns->emplace_back(value, next_delimiter - value);
590+
}
591+
value = next_delimiter + 1;
592+
ptr = next_delimiter + 1;
567593
}
568594
}
569595
} else {
@@ -584,12 +610,14 @@ void CSVReader::split_record(const Record& record, Fields* columns) const {
584610
} while (ptr != nullptr);
585611

586612
ptr = record.data + size;
587-
}
588-
if (_parse_options.trim_space) {
589-
std::pair<const char*, size_t> newPos = trim(value, ptr - value);
590-
columns->emplace_back(newPos.first, newPos.second);
591-
} else {
592-
columns->emplace_back(value, ptr - value);
613+
614+
// Add the last field for multi-character delimiter case
615+
if (_parse_options.trim_space) {
616+
std::pair<const char*, size_t> newPos = trim(value, ptr - value);
617+
columns->emplace_back(newPos.first, newPos.second);
618+
} else {
619+
columns->emplace_back(value, ptr - value);
620+
}
593621
}
594622
}
595623

be/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ set(EXEC_FILES
204204
./formats/csv/default_value_converter_test.cpp
205205
./formats/csv/string_converter_test.cpp
206206
./formats/csv/varbinary_converter_test.cpp
207+
./formats/csv/csv_reader_test.cpp
207208
./formats/json/binary_column_test.cpp
208209
./formats/json/numeric_column_test.cpp
209210
./formats/json/nullable_column_test.cpp
be/test/formats/csv/csv_reader_test.cpp

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
// Copyright 2021-present StarRocks, Inc. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "formats/csv/csv_reader.h"
16+
17+
#include <gtest/gtest.h>
18+
19+
#include "runtime/types.h"
20+
21+
namespace starrocks::csv {
22+
23+
// Mock CSVReader for testing - implements the pure virtual function
24+
class MockCSVReader : public starrocks::CSVReader {
25+
public:
26+
explicit MockCSVReader(const starrocks::CSVParseOptions& parse_options) : CSVReader(parse_options) {}
27+
28+
protected:
29+
starrocks::Status _fill_buffer() override {
30+
// Mock implementation - not needed for split_record tests
31+
return starrocks::Status::OK();
32+
}
33+
34+
char* _find_line_delimiter(starrocks::CSVBuffer& buffer, size_t pos) override {
35+
// Mock implementation - not needed for split_record tests
36+
return nullptr;
37+
}
38+
};
39+
40+
class CSVReaderTest : public ::testing::Test {
41+
public:
42+
CSVReaderTest() = default;
43+
44+
protected:
45+
void SetUp() override {
46+
_parse_options.column_delimiter = ",";
47+
_parse_options.row_delimiter = "\n";
48+
_parse_options.trim_space = false;
49+
}
50+
51+
starrocks::CSVParseOptions _parse_options;
52+
};
53+
54+
// NOLINTNEXTLINE
55+
TEST_F(CSVReaderTest, test_split_record_single_delimiter) {
56+
MockCSVReader reader(_parse_options);
57+
58+
// Test basic splitting
59+
starrocks::CSVReader::Record record1{"a,b,c", 5};
60+
starrocks::CSVReader::Fields fields1;
61+
reader.split_record(record1, &fields1);
62+
63+
EXPECT_EQ(3, fields1.size());
64+
EXPECT_EQ("a", fields1[0].to_string());
65+
EXPECT_EQ("b", fields1[1].to_string());
66+
EXPECT_EQ("c", fields1[2].to_string());
67+
}
68+
69+
// NOLINTNEXTLINE
70+
TEST_F(CSVReaderTest, test_split_record_empty_fields) {
71+
MockCSVReader reader(_parse_options);
72+
73+
// Test empty fields
74+
starrocks::CSVReader::Record record1{",,", 2};
75+
starrocks::CSVReader::Fields fields1;
76+
reader.split_record(record1, &fields1);
77+
78+
EXPECT_EQ(3, fields1.size());
79+
EXPECT_EQ("", fields1[0].to_string());
80+
EXPECT_EQ("", fields1[1].to_string());
81+
EXPECT_EQ("", fields1[2].to_string());
82+
}
83+
84+
// NOLINTNEXTLINE
85+
TEST_F(CSVReaderTest, test_split_record_ends_with_delimiter) {
86+
MockCSVReader reader(_parse_options);
87+
88+
// Test string ending with delimiter
89+
starrocks::CSVReader::Record record1{"a,b,", 4};
90+
starrocks::CSVReader::Fields fields1;
91+
reader.split_record(record1, &fields1);
92+
93+
EXPECT_EQ(3, fields1.size());
94+
EXPECT_EQ("a", fields1[0].to_string());
95+
EXPECT_EQ("b", fields1[1].to_string());
96+
EXPECT_EQ("", fields1[2].to_string());
97+
}
98+
99+
// NOLINTNEXTLINE
100+
TEST_F(CSVReaderTest, test_split_record_starts_with_delimiter) {
101+
MockCSVReader reader(_parse_options);
102+
103+
// Test string starting with delimiter
104+
starrocks::CSVReader::Record record1{",a,b", 4};
105+
starrocks::CSVReader::Fields fields1;
106+
reader.split_record(record1, &fields1);
107+
108+
EXPECT_EQ(3, fields1.size());
109+
EXPECT_EQ("", fields1[0].to_string());
110+
EXPECT_EQ("a", fields1[1].to_string());
111+
EXPECT_EQ("b", fields1[2].to_string());
112+
}
113+
114+
// NOLINTNEXTLINE
115+
TEST_F(CSVReaderTest, test_split_record_single_field) {
116+
MockCSVReader reader(_parse_options);
117+
118+
// Test single field (no delimiters)
119+
starrocks::CSVReader::Record record1{"single_field", 12};
120+
starrocks::CSVReader::Fields fields1;
121+
reader.split_record(record1, &fields1);
122+
123+
EXPECT_EQ(1, fields1.size());
124+
EXPECT_EQ("single_field", fields1[0].to_string());
125+
}
126+
127+
// NOLINTNEXTLINE
128+
TEST_F(CSVReaderTest, test_split_record_empty_string) {
129+
MockCSVReader reader(_parse_options);
130+
131+
// Test empty string
132+
starrocks::CSVReader::Record record1{"", 0};
133+
starrocks::CSVReader::Fields fields1;
134+
reader.split_record(record1, &fields1);
135+
136+
EXPECT_EQ(1, fields1.size());
137+
EXPECT_EQ("", fields1[0].to_string());
138+
}
139+
140+
// NOLINTNEXTLINE
141+
TEST_F(CSVReaderTest, test_split_record_multi_character_delimiter) {
142+
starrocks::CSVParseOptions options;
143+
options.column_delimiter = "||";
144+
options.row_delimiter = "\n";
145+
options.trim_space = false;
146+
147+
MockCSVReader reader(options);
148+
149+
// Test multi-character delimiter
150+
starrocks::CSVReader::Record record1{"a||b||c", 7};
151+
starrocks::CSVReader::Fields fields1;
152+
reader.split_record(record1, &fields1);
153+
154+
EXPECT_EQ(3, fields1.size());
155+
EXPECT_EQ("a", fields1[0].to_string());
156+
EXPECT_EQ("b", fields1[1].to_string());
157+
EXPECT_EQ("c", fields1[2].to_string());
158+
}
159+
160+
// NOLINTNEXTLINE
161+
TEST_F(CSVReaderTest, test_split_record_with_trim_space) {
162+
starrocks::CSVParseOptions options;
163+
options.column_delimiter = ",";
164+
options.row_delimiter = "\n";
165+
options.trim_space = true;
166+
167+
MockCSVReader reader(options);
168+
169+
// Test with trim_space enabled
170+
starrocks::CSVReader::Record record1{" a , b , c ", 11};
171+
starrocks::CSVReader::Fields fields1;
172+
reader.split_record(record1, &fields1);
173+
174+
EXPECT_EQ(3, fields1.size());
175+
EXPECT_EQ("a", fields1[0].to_string());
176+
EXPECT_EQ("b", fields1[1].to_string());
177+
EXPECT_EQ("c", fields1[2].to_string());
178+
}
179+
180+
// NOLINTNEXTLINE
181+
TEST_F(CSVReaderTest, test_split_record_large_data) {
182+
MockCSVReader reader(_parse_options);
183+
184+
// Test with larger data to verify performance optimization
185+
std::string large_data;
186+
for (int i = 0; i < 1000; ++i) {
187+
if (i > 0) large_data += ",";
188+
large_data += "field" + std::to_string(i);
189+
}
190+
191+
starrocks::CSVReader::Record record1{large_data.c_str(), large_data.size()};
192+
starrocks::CSVReader::Fields fields1;
193+
reader.split_record(record1, &fields1);
194+
195+
EXPECT_EQ(1000, fields1.size());
196+
EXPECT_EQ("field0", fields1[0].to_string());
197+
EXPECT_EQ("field999", fields1[999].to_string());
198+
}
199+
200+
} // namespace starrocks::csv

0 commit comments

Comments
 (0)