Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 126 additions & 6 deletions datafusion/functions/benches/trim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,19 @@ impl fmt::Display for TrimType {
/// For ltrim: trim characters are at the start (prefix)
/// For rtrim: trim characters are at the end (suffix)
/// For btrim: trim characters are at both start and end
fn create_string_array_and_characters(
fn create_string_array_and_characters_with_null_rate(
size: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
string_array_type: StringArrayType,
trim_type: TrimType,
null_rate: f32,
) -> (ArrayRef, ScalarValue) {
let rng = &mut StdRng::seed_from_u64(42);

// Create `size` rows:
// - 10% rows will be `None`
// - Other 90% will be strings with `remaining_len` content length
let string_iter = (0..size).map(|_| {
if rng.random::<f32>() < 0.1 {
if rng.random::<f32>() < null_rate {
None
} else {
let content: String = rng
Expand Down Expand Up @@ -129,13 +127,34 @@ fn create_args(
string_array_type: StringArrayType,
trim_type: TrimType,
) -> Vec<ColumnarValue> {
let (string_array, pattern) = create_string_array_and_characters(
create_args_with_null_rate(
size,
characters,
trimmed,
remaining_len,
string_array_type,
trim_type,
0.1,
)
}

fn create_args_with_null_rate(
size: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
string_array_type: StringArrayType,
trim_type: TrimType,
null_rate: f32,
) -> Vec<ColumnarValue> {
let (string_array, pattern) = create_string_array_and_characters_with_null_rate(
size,
characters,
trimmed,
remaining_len,
string_array_type,
trim_type,
null_rate,
);
vec![
ColumnarValue::Array(string_array),
Expand Down Expand Up @@ -227,6 +246,107 @@ fn criterion_benchmark(c: &mut Criterion) {
let ltrim = string::ltrim();
let rtrim = string::rtrim();
let btrim = string::btrim();
let config_options = Arc::new(ConfigOptions::default());

// Scalar ltrim benchmark on `Utf8` values; registered outside the size loop
// because a single scalar input does not vary with the array size.
c.bench_function("ltrim/scalar_utf8", |bencher| {
    let input = ScalarValue::Utf8(Some("___datafusion___".to_string()));
    let trim_chars = ScalarValue::Utf8(Some("_".to_string()));
    let str_field = Field::new("str", DataType::Utf8, false);
    let trim_field = Field::new("trim_str", DataType::Utf8, false);
    let return_field = Field::new("f", DataType::Utf8, true);

    // Assembled once; every iteration clones this bundle before invoking.
    let scalar_args = ScalarFunctionArgs {
        args: vec![
            ColumnarValue::Scalar(input),
            ColumnarValue::Scalar(trim_chars),
        ],
        arg_fields: vec![str_field.into(), trim_field.into()],
        number_rows: 1,
        return_field: return_field.into(),
        config_options: Arc::clone(&config_options),
    };

    bencher.iter(|| black_box(ltrim.invoke_with_args(scalar_args.clone()).unwrap()))
});

// Scalar ltrim benchmark on `Utf8View` values (same inputs as the `Utf8`
// scalar case, different string representation).
c.bench_function("ltrim/scalar_utf8view", |bencher| {
    let input = ScalarValue::Utf8View(Some("___datafusion___".to_string()));
    let trim_chars = ScalarValue::Utf8View(Some("_".to_string()));
    let str_field = Field::new("str", DataType::Utf8View, false);
    let trim_field = Field::new("trim_str", DataType::Utf8View, false);
    let return_field = Field::new("f", DataType::Utf8View, true);

    // Assembled once; every iteration clones this bundle before invoking.
    let scalar_args = ScalarFunctionArgs {
        args: vec![
            ColumnarValue::Scalar(input),
            ColumnarValue::Scalar(trim_chars),
        ],
        arg_fields: vec![str_field.into(), trim_field.into()],
        number_rows: 1,
        return_field: return_field.into(),
        config_options: Arc::clone(&config_options),
    };

    bencher.iter(|| black_box(ltrim.invoke_with_args(scalar_args.clone()).unwrap()))
});

// Array benchmarks over inputs that contain no nulls, to demonstrate the
// `null_count() == 0` fast path.
const N_ROWS_NO_NULLS: usize = 8192;

c.bench_function("ltrim/no_nulls_utf8", |b| {
    // A null rate of 0.0 means the generator never emits a `None` row.
    let args = create_args_with_null_rate(
        N_ROWS_NO_NULLS,
        "_",
        "____",
        8,
        StringArrayType::Utf8,
        TrimType::Ltrim,
        0.0,
    );
    let arg_fields = args
        .iter()
        .enumerate()
        .map(|(idx, arg)| {
            Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
        })
        .collect::<Vec<_>>();
    // Built once up front so the timed loop does not allocate a new `Field`
    // (and its name `String`) on every iteration.
    let return_field = Arc::new(Field::new("f", DataType::Utf8, true));

    b.iter(|| {
        black_box(
            ltrim
                .invoke_with_args(ScalarFunctionArgs {
                    args: args.clone(),
                    arg_fields: arg_fields.clone(),
                    number_rows: N_ROWS_NO_NULLS,
                    return_field: Arc::clone(&return_field),
                    config_options: Arc::clone(&config_options),
                })
                // Fail loudly instead of silently timing an error path;
                // this matches the scalar benchmarks.
                .unwrap(),
        )
    })
});

// Same no-null array benchmark as the `Utf8` case, using `Utf8View` input.
c.bench_function("ltrim/no_nulls_utf8view", |b| {
    // A null rate of 0.0 means the generator never emits a `None` row.
    let args = create_args_with_null_rate(
        N_ROWS_NO_NULLS,
        "_",
        "____",
        8,
        StringArrayType::Utf8View,
        TrimType::Ltrim,
        0.0,
    );
    let arg_fields = args
        .iter()
        .enumerate()
        .map(|(idx, arg)| {
            Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
        })
        .collect::<Vec<_>>();
    // Built once up front so the timed loop does not allocate a new `Field`
    // (and its name `String`) on every iteration.
    let return_field = Arc::new(Field::new("f", DataType::Utf8View, true));

    b.iter(|| {
        black_box(
            ltrim
                .invoke_with_args(ScalarFunctionArgs {
                    args: args.clone(),
                    arg_fields: arg_fields.clone(),
                    number_rows: N_ROWS_NO_NULLS,
                    return_field: Arc::clone(&return_field),
                    config_options: Arc::clone(&config_options),
                })
                // Fail loudly instead of silently timing an error path;
                // this matches the scalar benchmarks.
                .unwrap(),
        )
    })
});

let characters = ",!()";

Expand Down
152 changes: 113 additions & 39 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,32 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
1 => {
// Default whitespace trim - pattern is just space
let pattern = [' '];
for (src_str_opt, raw_view) in string_view_array
.iter()
.zip(string_view_array.views().iter())
{
trim_and_append_view::<Tr>(
src_str_opt,
&pattern,
&mut views_buf,
&mut null_builder,
raw_view,
);

if string_view_array.null_count() == 0 {
for (i, raw_view) in string_view_array.views().iter().enumerate() {
let src_str = string_view_array.value(i);
let (trimmed, offset) = Tr::trim(src_str, &pattern);
make_and_append_view(
&mut views_buf,
&mut null_builder,
raw_view,
trimmed,
offset,
);
}
} else {
for (src_str_opt, raw_view) in string_view_array
.iter()
.zip(string_view_array.views().iter())
{
trim_and_append_view::<Tr>(
src_str_opt,
&pattern,
&mut views_buf,
&mut null_builder,
raw_view,
);
}
}
}
2 => {
Expand All @@ -127,40 +142,73 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
}

let pattern: Vec<char> = characters_array.value(0).chars().collect();
for (src_str_opt, raw_view) in string_view_array
.iter()
.zip(string_view_array.views().iter())
{
trim_and_append_view::<Tr>(
src_str_opt,
&pattern,
&mut views_buf,
&mut null_builder,
raw_view,
);

if string_view_array.null_count() == 0 {
for (i, raw_view) in string_view_array.views().iter().enumerate() {
let src_str = string_view_array.value(i);
let (trimmed, offset) = Tr::trim(src_str, &pattern);
make_and_append_view(
&mut views_buf,
&mut null_builder,
raw_view,
trimmed,
offset,
);
}
} else {
for (src_str_opt, raw_view) in string_view_array
.iter()
.zip(string_view_array.views().iter())
{
trim_and_append_view::<Tr>(
src_str_opt,
&pattern,
&mut views_buf,
&mut null_builder,
raw_view,
);
}
}
} else {
// Per-row pattern - must compute pattern chars for each row
for ((src_str_opt, raw_view), characters_opt) in string_view_array
.iter()
.zip(string_view_array.views().iter())
.zip(characters_array.iter())
if string_view_array.null_count() == 0
&& characters_array.null_count() == 0
{
if let (Some(src_str), Some(characters)) =
(src_str_opt, characters_opt)
{
let pattern: Vec<char> = characters.chars().collect();
for i in 0..string_view_array.len() {
let pattern: Vec<char> =
characters_array.value(i).chars().collect();
let src_str = string_view_array.value(i);
let (trimmed, offset) = Tr::trim(src_str, &pattern);
make_and_append_view(
&mut views_buf,
&mut null_builder,
raw_view,
&string_view_array.views()[i],
trimmed,
offset,
);
} else {
null_builder.append_null();
views_buf.push(0);
}
} else {
for ((src_str_opt, raw_view), characters_opt) in string_view_array
.iter()
.zip(string_view_array.views().iter())
.zip(characters_array.iter())
{
if let (Some(src_str), Some(characters)) =
(src_str_opt, characters_opt)
{
let pattern: Vec<char> = characters.chars().collect();
let (trimmed, offset) = Tr::trim(src_str, &pattern);
make_and_append_view(
&mut views_buf,
&mut null_builder,
raw_view,
trimmed,
offset,
);
} else {
null_builder.append_null();
views_buf.push(0);
}
}
}
}
Expand Down Expand Up @@ -227,12 +275,19 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
1 => {
// Default whitespace trim - pattern is just space
let pattern = [' '];
let result = string_array
.iter()
.map(|string| string.map(|s| Tr::trim(s, &pattern).0))
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
if string_array.null_count() == 0 {
let result = (0..string_array.len())
.map(|i| Some(Tr::trim(string_array.value(i), &pattern).0))
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
} else {
let result = string_array
.iter()
.map(|string| string.map(|s| Tr::trim(s, &pattern).0))
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
}
2 => {
let characters_array = as_generic_string_array::<T>(&args[1])?;
Expand All @@ -247,6 +302,14 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
}

let pattern: Vec<char> = characters_array.value(0).chars().collect();

if string_array.null_count() == 0 {
let result = (0..string_array.len())
.map(|i| Some(Tr::trim(string_array.value(i), &pattern).0))
.collect::<GenericStringArray<T>>();
return Ok(Arc::new(result) as ArrayRef);
}

let result = string_array
.iter()
.map(|item| item.map(|s| Tr::trim(s, &pattern).0))
Expand All @@ -255,6 +318,17 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
}

// Per-row pattern - must compute pattern chars for each row
if string_array.null_count() == 0 && characters_array.null_count() == 0 {
let result = (0..string_array.len())
.map(|i| {
let pattern: Vec<char> =
characters_array.value(i).chars().collect();
Some(Tr::trim(string_array.value(i), &pattern).0)
})
.collect::<GenericStringArray<T>>();
return Ok(Arc::new(result) as ArrayRef);
}

let result = string_array
.iter()
.zip(characters_array.iter())
Expand Down
Loading