Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -296,18 +296,21 @@ voxtral_realtime-cuda:
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"

silero-vad-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
@echo "==> Building Silero VAD runner (CPU)..."
@echo "==> Configuring and installing ExecuTorch (without LLM runner)..."
cmake --preset llm-release -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=OFF
cmake --build cmake-out --parallel "$$(sysctl -n hw.ncpu)"
cmake --install cmake-out
@echo "==> Building Silero VAD runners (CPU)..."
cmake -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_FIND_ROOT_PATH=$(CURDIR)/cmake-out \
-DCMAKE_PREFIX_PATH=$(CURDIR)/cmake-out \
-S examples/models/silero_vad \
-B cmake-out/examples/models/silero_vad
cmake --build cmake-out/examples/models/silero_vad --target silero_vad_runner
cmake --build cmake-out/examples/models/silero_vad --target silero_vad_runner silero_vad_stream_runner
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/silero_vad/silero_vad_runner"
@echo " Binary: cmake-out/examples/models/silero_vad/silero_vad_stream_runner"

llama-cpu:
@echo "==> Building and installing ExecuTorch..."
Expand Down
42 changes: 35 additions & 7 deletions examples/models/silero_vad/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ if(NOT _executorch_imported)
executorch_target_link_options_shared_lib(executorch)
endif()

set(link_libraries executorch gflags)
set(common_link_libraries executorch gflags)

# Common ops for all builds
if(TARGET optimized_native_cpu_ops_lib)
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
list(APPEND common_link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
if(NOT _is_imported)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
Expand All @@ -46,7 +46,7 @@ if(TARGET xnnpack_backend)
if(TARGET kleidiai)
list(APPEND xnnpack_backend_libs kleidiai)
endif()
list(APPEND link_libraries ${xnnpack_backend_libs})
list(APPEND common_link_libraries ${xnnpack_backend_libs})
get_target_property(_xnnpack_imported xnnpack_backend IMPORTED)
if(NOT _xnnpack_imported)
executorch_target_link_options_shared_lib(xnnpack_backend)
Expand All @@ -55,14 +55,24 @@ endif()

# Needed for cpuinfo where it uses android specific log lib
if(ANDROID)
list(APPEND link_libraries log)
list(APPEND common_link_libraries log)
endif()

# Add the required ExecuTorch extensions
set(silero_runner_link_libraries ${common_link_libraries})
list(
APPEND
link_libraries
extension_llm_runner
silero_runner_link_libraries
extension_module
extension_data_loader
extension_tensor
extension_flat_tensor
)

set(silero_stream_link_libraries ${common_link_libraries})
list(
APPEND
silero_stream_link_libraries
extension_module
extension_data_loader
extension_tensor
Expand All @@ -80,5 +90,23 @@ endif()
target_include_directories(
silero_vad_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(silero_vad_runner PUBLIC ${link_libraries})
target_link_libraries(silero_vad_runner PUBLIC ${silero_runner_link_libraries})
target_compile_options(silero_vad_runner PUBLIC ${_common_compile_options})

add_executable(silero_vad_stream_runner stream_main.cpp silero_vad_runner.cpp)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(silero_vad_stream_runner)
if(NOT APPLE AND NOT MSVC)
target_link_options(silero_vad_stream_runner PRIVATE "LINKER:-s")
endif()
endif()

target_include_directories(
silero_vad_stream_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(
silero_vad_stream_runner PUBLIC ${silero_stream_link_libraries}
)
target_compile_options(
silero_vad_stream_runner PUBLIC ${_common_compile_options}
)
45 changes: 38 additions & 7 deletions examples/models/silero_vad/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@ Voice activity detection answers "when is someone speaking" — the model output
```bash
# Export to .pte
cd examples/models/silero_vad
python export_silero_vad.py --jit-model /path/to/silero_vad.jit
python export_silero_vad.py \
--jit-model /path/to/silero_vad.jit \
--backend xnnpack \
--output-dir ./silero_vad_xnnpack

# Build the C++ runner (from repo root)
# Build the C++ runners (from repo root)
make silero-vad-cpu

# Run VAD
# Run WAV-based VAD
./cmake-out/examples/models/silero_vad/silero_vad_runner \
--model_path examples/models/silero_vad/silero_vad_exports/silero_vad.pte \
--model_path examples/models/silero_vad/silero_vad_xnnpack/silero_vad.pte \
--audio_path /path/to/audio.wav
```

Expand Down Expand Up @@ -67,9 +70,9 @@ python export_silero_vad.py --jit-model /path/to/silero-vad/src/silero_vad/data/
| `--backend` | `portable` or `xnnpack` (default: `xnnpack`) |
| `--output-dir` | Output directory (default: `./silero_vad_exports`) |

Output: `silero_vad_exports/silero_vad.pte` (~2 MB).
Output: `silero_vad_xnnpack/silero_vad.pte` (~1.2 MB with XNNPACK, may vary by export settings).

## C++ Runner
## C++ Runners

### Build

Expand All @@ -79,16 +82,44 @@ From the repository root:
make silero-vad-cpu
```

Binary: `cmake-out/examples/models/silero_vad/silero_vad_runner`
This builds:

- `cmake-out/examples/models/silero_vad/silero_vad_runner`
- `cmake-out/examples/models/silero_vad/silero_vad_stream_runner`

The build configures and installs ExecuTorch first, then builds the two Silero VAD binaries from `examples/models/silero_vad/`.

### Arguments

#### `silero_vad_runner`

| Argument | Description |
|----------|-------------|
| `--model_path` | Path to `.pte` file (default: `silero_vad.pte`) |
| `--audio_path` | Path to input WAV file (16kHz mono, required) |
| `--threshold` | Speech probability threshold, 0.0–1.0 (default: `0.5`) |

#### `silero_vad_stream_runner`

| Argument | Description |
|----------|-------------|
| `--model_path` | Path to `.pte` file (default: `silero_vad.pte`) |

The stream runner reads 16kHz mono `float32` PCM from `stdin` and prints:

```text
READY
PROB <time_seconds> <probability>
```

Example:

```bash
ffmpeg -i input.wav -ar 16000 -ac 1 -f f32le -nostats -loglevel error pipe:1 | \
./cmake-out/examples/models/silero_vad/silero_vad_stream_runner \
--model_path examples/models/silero_vad/silero_vad_xnnpack/silero_vad.pte
```

### How It Works

The model processes audio in 512-sample chunks (32ms at 16kHz). Each chunk is prepended with 64 samples of context from the previous chunk, forming a 576-sample input. The model carries an LSTM hidden state across chunks and outputs a single speech probability per chunk.
Expand Down
156 changes: 76 additions & 80 deletions examples/models/silero_vad/silero_vad_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,88 @@ SileroVadRunner::SileroVadRunner(const std::string& model_path) {
context_size_ = cs.ok() ? cs.get()[0].toInt() : 64;
input_size_ = window_size_ + context_size_;
frame_duration_ = static_cast<double>(window_size_) / sample_rate_;
reset_stream();
}

// Reset all streaming state so the next call to process_frame() starts a
// fresh VAD session: zero the LSTM state, zero the rolled-over audio
// context, clear the input scratch buffer, and rewind the frame counter.
// Called from the constructor and at the top of detect().
void SileroVadRunner::reset_stream() {
// LSTM state is (2, 1, kHiddenDim) flattened — [h, c] stacked (see the
// state tensor shape in process_frame()).
stream_state_data_.assign(static_cast<size_t>(2 * kHiddenDim), 0.0f);
// Last context_size_ samples of the previous chunk; zeros == silence.
stream_context_.assign(static_cast<size_t>(context_size_), 0.0f);
// Scratch buffer laid out as [context | chunk] = input_size_ samples.
stream_input_.assign(static_cast<size_t>(input_size_), 0.0f);
stream_frame_index_ = 0;
}

// Run one VAD inference step on the next chunk of streaming audio.
//
// Consumes at most window_size_ samples from audio_data (callers streaming a
// longer buffer must advance the pointer themselves — see the loop in
// detect()). A short final chunk (num_samples < window_size_) is zero-padded.
//
// Side effects: mutates stream_input_, stream_state_data_, stream_context_
// and stream_frame_index_, so frames must be fed in order and this method is
// not safe to call concurrently on one runner instance.
//
// @param audio_data  PCM float samples; presumably 16 kHz mono per the model
//                    contract — not validated here, confirm at call sites.
// @param num_samples Number of valid samples available at audio_data.
// @return Speech probability for this frame, in [0, 1] per the model's
//         output contract (this function just forwards output 0).
float SileroVadRunner::process_frame(
const float* audio_data,
int64_t num_samples) {
// How many real samples this frame carries; the rest is zero padding.
int64_t chunk_len = std::min(window_size_, num_samples);

// Build the model input in-place: [context | chunk].
std::memcpy(
stream_input_.data(),
stream_context_.data(),
static_cast<size_t>(context_size_) * sizeof(float));

if (chunk_len > 0) {
std::memcpy(
stream_input_.data() + context_size_,
audio_data,
static_cast<size_t>(chunk_len) * sizeof(float));
}
if (chunk_len < window_size_) {
// Zero-pad a partial (tail) chunk out to the full window.
std::memset(
stream_input_.data() + context_size_ + chunk_len,
0,
static_cast<size_t>(window_size_ - chunk_len) * sizeof(float));
}

// Wrap the member buffers as tensors without copying; lifetimes are fine
// because the buffers outlive the execute() call below.
auto input_tensor = from_blob(
stream_input_.data(),
{1, static_cast<::executorch::aten::SizesType>(input_size_)},
::executorch::aten::ScalarType::Float);
// LSTM state tensor: shape (2, 1, kHiddenDim) — [h, c].
auto state_tensor = from_blob(
stream_state_data_.data(),
{2, 1, static_cast<::executorch::aten::SizesType>(kHiddenDim)},
::executorch::aten::ScalarType::Float);

auto result = model_->execute(
"forward", std::vector<EValue>{input_tensor, state_tensor});
// Fatal on failure: mid-stream the LSTM state would be unrecoverable.
ET_CHECK_MSG(result.ok(), "Silero VAD forward failed.");

auto& outputs = result.get();
// Output 0: single speech probability for this frame.
float prob = outputs[0].toTensor().const_data_ptr<float>()[0];

// Output 1: updated LSTM state — copy back so the next frame continues it.
auto new_state = outputs[1].toTensor();
std::memcpy(
stream_state_data_.data(),
new_state.const_data_ptr<float>(),
static_cast<size_t>(2 * kHiddenDim) * sizeof(float));

// Roll the context forward: it must hold the last context_size_ REAL
// samples seen (never the zero padding).
if (chunk_len >= context_size_) {
// Full-size context available in this chunk alone.
std::memcpy(
stream_context_.data(),
audio_data + chunk_len - context_size_,
static_cast<size_t>(context_size_) * sizeof(float));
} else if (chunk_len > 0) {
// Short chunk: keep the newest (context_size_ - chunk_len) old samples
// and append the whole chunk. memmove because src/dst overlap.
int64_t keep = context_size_ - chunk_len;
std::memmove(
stream_context_.data(),
stream_context_.data() + chunk_len,
static_cast<size_t>(keep) * sizeof(float));
std::memcpy(
stream_context_.data() + keep,
audio_data,
static_cast<size_t>(chunk_len) * sizeof(float));
}
// chunk_len == 0: context and state were still advanced through the model
// with a silent window; frame index ticks regardless.

stream_frame_index_++;
return prob;
}

SileroVadRunner::Result SileroVadRunner::detect(
const float* audio_data,
int64_t num_samples,
float threshold,
SegmentCallback segment_cb) {
// LSTM state: (2, 1, 128) — [h, c]
std::vector<float> state_data(static_cast<size_t>(2 * kHiddenDim), 0.0f);

// Context: previous chunk's last context_size_ samples
std::vector<float> context(static_cast<size_t>(context_size_), 0.0f);

// Input buffer: [context | chunk] = input_size_ samples
std::vector<float> input(static_cast<size_t>(input_size_));
reset_stream();

bool speech_active = false;
int64_t speech_start_frame = 0;
Expand All @@ -66,78 +133,7 @@ SileroVadRunner::Result SileroVadRunner::detect(
int num_segments = 0;

for (int64_t offset = 0; offset < num_samples; offset += window_size_) {
int64_t chunk_len = std::min(window_size_, num_samples - offset);

// Build input: [context | chunk]
std::memcpy(
input.data(),
context.data(),
static_cast<size_t>(context_size_) * sizeof(float));

if (chunk_len == window_size_) {
std::memcpy(
input.data() + context_size_,
audio_data + offset,
static_cast<size_t>(window_size_) * sizeof(float));
} else {
// Pad the last partial chunk with zeros
std::memcpy(
input.data() + context_size_,
audio_data + offset,
static_cast<size_t>(chunk_len) * sizeof(float));
std::memset(
input.data() + context_size_ + chunk_len,
0,
static_cast<size_t>(window_size_ - chunk_len) * sizeof(float));
}

auto input_tensor = from_blob(
input.data(),
{1, static_cast<::executorch::aten::SizesType>(input_size_)},
::executorch::aten::ScalarType::Float);
auto state_tensor = from_blob(
state_data.data(),
{2, 1, static_cast<::executorch::aten::SizesType>(kHiddenDim)},
::executorch::aten::ScalarType::Float);

auto result = model_->execute(
"forward", std::vector<EValue>{input_tensor, state_tensor});
if (!result.ok()) {
ET_LOG(
Error,
"forward failed at offset %lld.",
static_cast<long long>(offset));
break;
}

auto& outputs = result.get();
float prob = outputs[0].toTensor().const_data_ptr<float>()[0];

// Update LSTM state
auto new_state = outputs[1].toTensor();
std::memcpy(
state_data.data(),
new_state.const_data_ptr<float>(),
static_cast<size_t>(2 * kHiddenDim) * sizeof(float));

// Update context from current chunk
if (chunk_len >= context_size_) {
std::memcpy(
context.data(),
audio_data + offset + chunk_len - context_size_,
static_cast<size_t>(context_size_) * sizeof(float));
} else {
// Shift existing context and append partial chunk
int64_t keep = context_size_ - chunk_len;
std::memmove(
context.data(),
context.data() + chunk_len,
static_cast<size_t>(keep) * sizeof(float));
std::memcpy(
context.data() + keep,
audio_data + offset,
static_cast<size_t>(chunk_len) * sizeof(float));
}
float prob = process_frame(audio_data + offset, num_samples - offset);

// Threshold-based speech detection
if (prob > threshold) {
Expand Down
Loading
Loading