pytorch · seyeong-han · Mar 14, 2026 · Mar 18, 2026 · Mar 19, 2026 · Mar 24, 2026
diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal qwen3-tts-cpu whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -101,6 +101,7 @@ help:
 	@echo "  voxtral_realtime-cuda - Build Voxtral Realtime runner with CUDA backend"
 	@echo "  voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
 	@echo "  voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
+	@echo "  qwen3-tts-cpu       - Build Qwen3-TTS runner with CPU backend"
 	@echo "  whisper-cuda        - Build Whisper runner with CUDA backend"
 	@echo "  whisper-cuda-debug  - Build Whisper runner with CUDA backend (debug mode)"
 	@echo "  whisper-cpu         - Build Whisper runner with CPU backend"
@@ -264,6 +265,15 @@ voxtral_realtime-cuda:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
 
+qwen3-tts-cpu: # Runs the llm-release workflow preset, then the qwen3-tts-cpu workflow preset from examples/models/qwen3-tts.
+	@echo "==> Building and installing ExecuTorch..."
+	cmake --workflow --preset llm-release
+	@echo "==> Building Qwen3-TTS runner (CPU)..."
+	cd examples/models/qwen3-tts && cmake --workflow --preset qwen3-tts-cpu
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3-tts/qwen3_tts_unified_runner"
+
 silero-vad-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release

@@ -0,0 +1,127 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.24)
+project(qwen3_tts_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# gflags and executorch are looked up three levels above this build directory.
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(_link_libraries executorch gflags)
+
+# Common ops for all builds.
+list(APPEND _link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+
+# CPU path can require quantized/custom ops when XNNPACK delegates are present.
+if(NOT EXECUTORCH_BUILD_CUDA)
+  list(APPEND _link_libraries quantized_ops_lib custom_ops)
+  executorch_target_link_options_shared_lib(quantized_ops_lib)
+  executorch_target_link_options_shared_lib(custom_ops)
+endif()
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND _link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Base extensions needed for module loading + tensors.
+list(
+  APPEND
+  _link_libraries
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+if(ANDROID)
+  list(APPEND _link_libraries log)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND _link_libraries aoti_cuda_backend)
+  if(NOT MSVC)
+    executorch_target_link_options_shared_lib(aoti_cuda_backend)
+  endif()
+endif()
+
+# Speech-decoder runner (codec ids -> wav).
+add_executable(qwen3_tts_runner main.cpp qwen3_tts_runner.cpp)
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(qwen3_tts_runner)
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(qwen3_tts_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(qwen3_tts_runner PUBLIC ${_common_include_directories})
+target_link_libraries(qwen3_tts_runner PUBLIC ${_link_libraries})
+target_compile_options(qwen3_tts_runner PUBLIC ${_common_compile_options})
+
+# Unified runner: single .pte with all methods (text -> audio).
+add_executable(
+  qwen3_tts_unified_runner main_unified.cpp qwen3_tts_unified_runner.cpp
+)
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(qwen3_tts_unified_runner)
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(qwen3_tts_unified_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(
+  qwen3_tts_unified_runner PUBLIC ${_common_include_directories}
+)
+target_link_libraries(
+  qwen3_tts_unified_runner PUBLIC ${_link_libraries} extension_llm_runner
+)
+
+# Metal/AOTI backend for GPU acceleration.
+if(EXECUTORCH_BUILD_METAL)
+  target_link_libraries(qwen3_tts_unified_runner PUBLIC metal_backend)
+  executorch_target_link_options_shared_lib(metal_backend)
+endif()
+target_compile_options(
+  qwen3_tts_unified_runner PUBLIC ${_common_compile_options}
+)
+
+# On MSVC CUDA builds, copy aoti_cuda_shims next to each runner after it is
+# built. The copy is attached to both targets so it also runs when only
+# qwen3_tts_unified_runner is built (the only target the build preset requests).
+if(MSVC AND EXECUTORCH_BUILD_CUDA)
+  foreach(_runner_target qwen3_tts_runner qwen3_tts_unified_runner)
+    add_custom_command(
+      TARGET ${_runner_target}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different
+              $<TARGET_FILE:aoti_cuda_shims>
+              $<TARGET_FILE_DIR:${_runner_target}>
+      COMMENT "Copying aoti_cuda_shims.dll to ${_runner_target} directory"
+      VERBATIM
+    )
+  endforeach()
+endif()
@@ -0,0 +1,48 @@
+{
+  "version": 6,
+  "configurePresets": [
+    {
+      "name": "qwen3-tts-base",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/qwen3-tts",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+        "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
+      }
+    },
+    {
+      "name": "qwen3-tts-cpu",
+      "displayName": "Qwen3-TTS runner (CPU)",
+      "inherits": [
+        "qwen3-tts-base"
+      ]
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "qwen3-tts-cpu",
+      "displayName": "Build Qwen3-TTS runner (CPU)",
+      "configurePreset": "qwen3-tts-cpu",
+      "targets": [
+        "qwen3_tts_unified_runner"
+      ]
+    }
+  ],
+  "workflowPresets": [
+    {
+      "name": "qwen3-tts-cpu",
+      "displayName": "Configure and build Qwen3-TTS runner (CPU)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "qwen3-tts-cpu"
+        },
+        {
+          "type": "build",
+          "name": "qwen3-tts-cpu"
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1,129 @@
+# Qwen3-TTS Bring-up Context
+
+## Scope
+
+- Target model: `Qwen/Qwen3-TTS-12Hz-0.6B-Base`
+- Target path: `examples/models/qwen3-tts`
+- Backend: XNNPACK (CPU)
+
+## Reference patterns used
+
+### 1) Qwen conversion/export patterns
+
+- `examples/models/qwen3/convert_weights.py`
+  - HF checkpoint conversion style with shard handling.
+- `examples/models/qwen3_5/convert_weights.py`
+  - strict key mapping behavior and defensive conversion logic.
+- `examples/models/qwen3_5/tests/test_convert_weights.py`
+  - focused conversion unit tests for mapping and unknown keys.
+
+### 2) Speech model export/runtime patterns
+
+- `examples/models/voxtral_realtime/export_voxtral_rt.py`
+  - multi-method export wrappers.
+  - backend split and metadata in `constant_methods`.
+- `examples/models/voxtral_realtime/voxtral_realtime_runner.cpp`
+  - custom C++ runner using `executorch::extension::Module`.
+- `examples/models/whisper/main.cpp`
+  - ASR runtime ergonomics and preprocessor handoff.
+
+### 3) Build integration patterns
+
+- `examples/models/whisper/CMakeLists.txt`
+- `examples/models/whisper/CMakePresets.json`
+- top-level `Makefile`
+
+### 4) Backend support references
+
+- `examples/models/MODEL_BACKEND_SUPPORT.md`
+  - confirms XNNPACK as the practical first backend target for CPU bring-up.
+  - speech model examples currently emphasize CUDA/Metal; this bring-up closes a
+    gap for CPU-oriented TTS decode execution.
+
+## Repository observations (examples/models survey)
+
+- Existing audio examples are STT-focused (`whisper`, `parakeet`, `voxtral_realtime`).
+- No first-class generic TTS runner existed before this bring-up.
+- The closest existing reusable primitive for speech output generation is the
+  tokenizer/codec decoder stack (not yet standardized as a shared TTS runtime).
+
+## Qwen3-TTS package observations
+
+- `Qwen3TTSModel.generate_voice_clone(...)` performs:
+  - text/ref prompt packing,
+  - talker generation of codec tokens,
+  - speech tokenizer decode into waveform.
+- Speech tokenizer decode path for 12Hz variant is represented by
+  `Qwen3TTSTokenizerV2Decoder` and can run from codebook tokens.
+- Full talker generation export to ExecuTorch is significantly larger in scope
+  (autoregressive + sub-talker generation path and cache/state flow).
+
+## Bring-up design choice
+
+To get XNNPACK validation first:
+
+- Export the **speech-tokenizer decoder** into ExecuTorch.
+- Keep **codec generation** in Python helper using upstream `qwen_tts`.
+- Add a C++ runner that:
+  - optionally invokes the Python helper (`text -> codec ids`),
+  - then decodes the codec ids through the exported `model.pte` (`codec ids -> wav`).
+
+This keeps the path runnable and measurable while preserving room to move
+talker generation into ExecuTorch in a follow-up phase.
+
+## Implemented architecture map
+
+### Conversion layer
+
+- `convert_weights.py`
+  - pulls local or remote HF snapshots.
+  - reads safetensor shards and extracts:
+    - speech decoder weights (`decoder.*` from `speech_tokenizer/`)
+    - optional talker weights (`talker.*` from root model)
+  - writes `decoder_metadata.json` for export/runtime contracts.
+
+### Export layer
+
+- `model.py`
+  - defines `Qwen3TTSSpeechDecoderExport` wrapper.
+  - computes output lengths from codec tokens and runs decoder forward.
+- `export_qwen3_tts.py`
+  - lowers wrapper to ExecuTorch.
+  - attaches `constant_methods` metadata:
+    - `output_sample_rate`
+    - `decode_upsample_rate`
+    - `num_quantizers`
+    - `codebook_size`
+    - `fixed_codes_len`
+  - supports fp32/bf16 and optional 8da4w quant for linear layers.
+
+### Runtime layer
+
+- `generate_codes.py`
+  - uses upstream `Qwen3TTSModel` for text->codec generation.
+  - supports:
+    - text-only mode (fallback x-vector prompt from generated silence)
+    - voice clone mode (`ref_audio` + optional `ref_text`)
+  - emits compact binary codec file consumed by C++ runner.
+- `qwen3_tts_runner.cpp`
+  - loads exported decoder `.pte`.
+  - optionally invokes helper script for codec generation.
+  - pads codec sequence to `fixed_codes_len` and decodes waveform.
+  - writes PCM16 WAV output.
+
+## Why fixed-length export is used
+
+- Initial dynamic-shape export failed with `torch.export` constraint violations
+  on `codes_len` for decoder internals.
+- Static export (`fixed_codes_len=1200`) was adopted to unblock XNNPACK
+  execution.
+- The runner pads with the sentinel `-1`, so the decoder's length metadata can
+  still be used to trim the output to its true length.
+
+## Follow-up work suggested by this bring-up
+
+1. Move talker autoregressive generation into ExecuTorch methods
+   (prefill/decode-step style).
+2. Investigate BF16 decode runtime stall observed in current experiments.
+3. Add Metal backend support for the speech decoder.
+4. Replace helper-script dependency with fully in-runner ExecuTorch graph path.