Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal qwen3-tts-cpu whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
Expand All @@ -101,6 +101,7 @@ help:
@echo " voxtral_realtime-cuda - Build Voxtral Realtime runner with CUDA backend"
@echo " voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
@echo " voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
@echo " qwen3-tts-cpu - Build Qwen3-TTS runner with CPU backend"
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
@echo " whisper-cuda-debug - Build Whisper runner with CUDA backend (debug mode)"
@echo " whisper-cpu - Build Whisper runner with CPU backend"
Expand Down Expand Up @@ -264,6 +265,15 @@ voxtral_realtime-cuda:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"

# Build the Qwen3-TTS runner for the CPU backend in two steps:
#   1. run the top-level `llm-release` CMake workflow preset (announced as
#      building and installing ExecuTorch);
#   2. run the `qwen3-tts-cpu` workflow preset from examples/models/qwen3-tts.
# The final messages report the expected location of the
# qwen3_tts_unified_runner binary under cmake-out/.
qwen3-tts-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
@echo "==> Building Qwen3-TTS runner (CPU)..."
cd examples/models/qwen3-tts && cmake --workflow --preset qwen3-tts-cpu
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/qwen3-tts/qwen3_tts_unified_runner"

silero-vad-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
Expand Down
118 changes: 118 additions & 0 deletions examples/models/qwen3-tts/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.24)
project(qwen3_tts_runner)

# Compile everything in this project as C++17, with no fallback to an older
# standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

# The ExecuTorch checkout root sits three directories above this example; the
# shared CMake helpers used below are loaded from it.
set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Expose the parent of the checkout as an include root so sources can write
# `#include <executorch/path/to/header.h>`.
set(_common_include_directories "${EXECUTORCH_ROOT}/..")

# gflags is looked up in the third-party/gflags directory located three levels
# above this project's binary directory.
set(gflags_DIR "${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags")
find_package(gflags REQUIRED)

# The executorch package config is searched for three levels above this
# project's binary directory as well.
list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_CURRENT_BINARY_DIR}/../../..")
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

# Seed list of libraries linked into every runner; later sections append to it.
set(_link_libraries executorch gflags)

# Op libraries linked in every configuration.
list(APPEND _link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)

# Non-CUDA builds also link the quantized and custom op libraries, which can be
# required when XNNPACK delegates are present.
if(NOT EXECUTORCH_BUILD_CUDA)
  list(APPEND _link_libraries quantized_ops_lib custom_ops)
  foreach(_cpu_ops_lib IN ITEMS quantized_ops_lib custom_ops)
    executorch_target_link_options_shared_lib(${_cpu_ops_lib})
  endforeach()
endif()

# XNNPACK backend, linked only when its target is available. kleidiai is added
# when that target exists too.
if(TARGET xnnpack_backend)
  list(APPEND _link_libraries xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  if(TARGET kleidiai)
    list(APPEND _link_libraries kleidiai)
  endif()
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# Extensions required for module loading and tensor handling.
list(APPEND _link_libraries extension_module extension_data_loader
     extension_tensor extension_flat_tensor
)

# Android builds link the `log` library.
if(ANDROID)
  list(APPEND _link_libraries log)
endif()

# CUDA builds require the CUDA toolkit and link the AOTI CUDA backend. The
# shared-lib link-options helper is skipped for MSVC.
if(EXECUTORCH_BUILD_CUDA)
  find_package(CUDAToolkit REQUIRED)
  list(APPEND _link_libraries aoti_cuda_backend)
  if(NOT MSVC)
    executorch_target_link_options_shared_lib(aoti_cuda_backend)
  endif()
endif()

# Runner executable built from main.cpp and qwen3_tts_runner.cpp.
add_executable(qwen3_tts_runner main.cpp qwen3_tts_runner.cpp)

# Every build type except Debug gets section garbage collection; the binary is
# additionally stripped (-s) on platforms other than Apple and MSVC.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(qwen3_tts_runner)
  if(NOT (APPLE OR MSVC))
    target_link_options(qwen3_tts_runner PRIVATE "LINKER:-s")
  endif()
endif()

# Apply the shared compile options, include roots and link libraries.
target_compile_options(qwen3_tts_runner PUBLIC ${_common_compile_options})
target_include_directories(
  qwen3_tts_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(qwen3_tts_runner PUBLIC ${_link_libraries})

# Unified runner: one .pte containing all methods (text -> audio), built from
# main_unified.cpp and qwen3_tts_unified_runner.cpp.
add_executable(
  qwen3_tts_unified_runner main_unified.cpp qwen3_tts_unified_runner.cpp
)

# Every build type except Debug gets section garbage collection; the binary is
# additionally stripped (-s) on platforms other than Apple and MSVC.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(qwen3_tts_unified_runner)
  if(NOT (APPLE OR MSVC))
    target_link_options(qwen3_tts_unified_runner PRIVATE "LINKER:-s")
  endif()
endif()

# Apply the shared compile options and include roots.
target_compile_options(
  qwen3_tts_unified_runner PUBLIC ${_common_compile_options}
)
target_include_directories(
  qwen3_tts_unified_runner PUBLIC ${_common_include_directories}
)

# Same libraries as the other runner, plus the LLM runner extension.
target_link_libraries(
  qwen3_tts_unified_runner PUBLIC ${_link_libraries} extension_llm_runner
)

# Metal/AOTI backend for GPU acceleration, linked only when Metal is enabled.
if(EXECUTORCH_BUILD_METAL)
  target_link_libraries(qwen3_tts_unified_runner PUBLIC metal_backend)
  executorch_target_link_options_shared_lib(metal_backend)
endif()

# MSVC + CUDA: after each runner is linked, copy the aoti_cuda_shims library
# into that runner's output directory.
#
# The copy is attached to BOTH runners. Each of them links aoti_cuda_backend
# through _link_libraries, and a build that requests only
# qwen3_tts_unified_runner (as the qwen3-tts-cpu build preset does) would
# otherwise never trigger a POST_BUILD step hung solely on qwen3_tts_runner.
# copy_if_different makes the second copy a no-op when the file is already in
# place. VERBATIM keeps argument escaping consistent across generators.
if(MSVC AND EXECUTORCH_BUILD_CUDA)
  foreach(_qwen3_tts_exe IN ITEMS qwen3_tts_runner qwen3_tts_unified_runner)
    add_custom_command(
      TARGET ${_qwen3_tts_exe}
      POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy_if_different
              $<TARGET_FILE:aoti_cuda_shims>
              $<TARGET_FILE_DIR:${_qwen3_tts_exe}>
      COMMENT "Copying aoti_cuda_shims.dll to ${_qwen3_tts_exe} directory"
      VERBATIM
    )
  endforeach()
endif()
48 changes: 48 additions & 0 deletions examples/models/qwen3-tts/CMakePresets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"version": 6,
"configurePresets": [
{
"name": "qwen3-tts-base",
"hidden": true,
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/qwen3-tts",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
}
},
{
"name": "qwen3-tts-cpu",
"displayName": "Qwen3-TTS runner (CPU)",
"inherits": [
"qwen3-tts-base"
]
}
],
"buildPresets": [
{
"name": "qwen3-tts-cpu",
"displayName": "Build Qwen3-TTS runner (CPU)",
"configurePreset": "qwen3-tts-cpu",
"targets": [
"qwen3_tts_unified_runner"
]
}
],
"workflowPresets": [
{
"name": "qwen3-tts-cpu",
"displayName": "Configure and build Qwen3-TTS runner (CPU)",
"steps": [
{
"type": "configure",
"name": "qwen3-tts-cpu"
},
{
"type": "build",
"name": "qwen3-tts-cpu"
}
]
}
]
}
129 changes: 129 additions & 0 deletions examples/models/qwen3-tts/CONTEXT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Qwen3-TTS Bring-up Context

## Scope

- Target model: `Qwen/Qwen3-TTS-12Hz-0.6B-Base`
- Target path: `examples/models/qwen3-tts`
- Backend: XNNPACK (CPU)

## Reference patterns used

### 1) Qwen conversion/export patterns

- `examples/models/qwen3/convert_weights.py`
- HF checkpoint conversion style with shard handling.
- `examples/models/qwen3_5/convert_weights.py`
- strict key mapping behavior and defensive conversion logic.
- `examples/models/qwen3_5/tests/test_convert_weights.py`
- focused conversion unit tests for mapping and unknown keys.

### 2) Speech model export/runtime patterns

- `examples/models/voxtral_realtime/export_voxtral_rt.py`
- multi-method export wrappers.
- backend split and metadata in `constant_methods`.
- `examples/models/voxtral_realtime/voxtral_realtime_runner.cpp`
- custom C++ runner using `executorch::extension::Module`.
- `examples/models/whisper/main.cpp`
- ASR runtime ergonomics and preprocessor handoff.

### 3) Build integration patterns

- `examples/models/whisper/CMakeLists.txt`
- `examples/models/whisper/CMakePresets.json`
- top-level `Makefile`

### 4) Backend support references

- `examples/models/MODEL_BACKEND_SUPPORT.md`
- confirms XNNPACK as the practical first backend target for CPU bring-up.
- speech model examples currently emphasize CUDA/Metal; this bring-up closes a
gap for CPU-oriented TTS decode execution.

## Repository observations (examples/models survey)

- Existing audio examples are STT-focused (`whisper`, `parakeet`, `voxtral_realtime`).
- No first-class generic TTS runner existed before this bring-up.
- The closest existing reusable primitives for speech output generation are the
tokenizer/codec decoder stacks, which are not yet standardized as a shared TTS runtime.

## Qwen3-TTS package observations

- `Qwen3TTSModel.generate_voice_clone(...)` performs:
- text/ref prompt packing,
- talker generation of codec tokens,
- speech tokenizer decode into waveform.
- Speech tokenizer decode path for 12Hz variant is represented by
`Qwen3TTSTokenizerV2Decoder` and can run from codebook tokens.
- Full talker generation export to ExecuTorch is significantly larger in scope
(autoregressive + sub-talker generation path and cache/state flow).

## Bring-up design choice

To get XNNPACK validation first:

- Export the **speech-tokenizer decoder** into ExecuTorch.
- Keep **codec generation** in Python helper using upstream `qwen_tts`.
- Add a C++ runner that:
- optionally invokes helper (`text -> codec ids`)
- then decodes codec ids through exported `model.pte` (`codec ids -> wav`).

This keeps the path runnable and measurable while preserving room to move
talker generation into ExecuTorch in a follow-up phase.

## Implemented architecture map

### Conversion layer

- `convert_weights.py`
- pulls local or remote HF snapshots.
- reads safetensor shards and extracts:
- speech decoder weights (`decoder.*` from `speech_tokenizer/`)
- optional talker weights (`talker.*` from root model)
- writes `decoder_metadata.json` for export/runtime contracts.

### Export layer

- `model.py`
- defines `Qwen3TTSSpeechDecoderExport` wrapper.
- computes output lengths from codec tokens and runs decoder forward.
- `export_qwen3_tts.py`
- lowers wrapper to ExecuTorch.
- attaches `constant_methods` metadata:
- `output_sample_rate`
- `decode_upsample_rate`
- `num_quantizers`
- `codebook_size`
- `fixed_codes_len`
- supports fp32/bf16 and optional 8da4w quant for linear layers.

### Runtime layer

- `generate_codes.py`
- uses upstream `Qwen3TTSModel` for text->codec generation.
- supports:
- text-only mode (fallback x-vector prompt from generated silence)
- voice clone mode (`ref_audio` + optional `ref_text`)
- emits compact binary codec file consumed by C++ runner.
- `qwen3_tts_runner.cpp`
- loads exported decoder `.pte`.
- optionally invokes helper script for codec generation.
- pads codec sequence to `fixed_codes_len` and decodes waveform.
- writes PCM16 WAV output.

## Why fixed-length export is used

- Initial dynamic-shape export failed with `torch.export` constraint violations
on `codes_len` for decoder internals.
- Static export (`fixed_codes_len=1200`) was adopted to unblock XNNPACK
execution.
- The runner pads the codec sequence with the sentinel value `-1`, so the
decoder's length metadata still reflects the true (unpadded) length and the
output waveform can be trimmed to it.

## Follow-up work suggested by this bring-up

1. Move talker autoregressive generation into ExecuTorch methods
(prefill/decode-step style).
2. Investigate BF16 decode runtime stall observed in current experiments.
3. Add Metal backend support for the speech decoder.
4. Replace helper-script dependency with fully in-runner ExecuTorch graph path.
Loading
Loading