Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ exclude_patterns = [
'extension/llm/tokenizers',
'extension/llm/tokenizers/**',
'examples/cuda',
'examples/nxp',
'kernels/portable',
# File contains @generated
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
Expand Down
2 changes: 1 addition & 1 deletion backends/cortex_m/passes/quantized_op_fusion_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,5 +412,5 @@ def call_operator(
case _:
pass

result = super().call_operator(op, args, {}, meta)
result = super().call_operator(op, args, kwargs, meta)
return result
31 changes: 26 additions & 5 deletions backends/cortex_m/quantizer/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# LICENSE file in the root directory of this source tree.


from typing import cast, List, Optional
from typing import Callable, cast, Iterator, List, Optional

from executorch.backends.arm.quantizer.arm_quantizer_utils import (
_mark_node_as_quantized,
NodeFinder,
PatternCheck,
PatternQuantizer,
SharedQspecQuantizer,
Expand All @@ -30,7 +31,7 @@
CORTEX_M_QUANTIZER_SUPPORT_DICT,
)
from torch._ops import OpOverload
from torch.fx import GraphModule
from torch.fx import GraphModule, Node
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer


Expand All @@ -43,9 +44,27 @@ def mark_node_as_annotated(
_mark_node_as_quantized(node, input_qspec_map, output_qspec, is_quantized)


class FilteredNodeFinder(NodeFinder):
    """NodeFinder decorator that yields only nodes accepted by a predicate.

    When no predicate is supplied, the wrapped finder's nodes are passed
    through unchanged.
    """

    def __init__(
        self, base_finder: NodeFinder, filter_fn: Callable[[Node], bool] | None = None
    ) -> None:
        # Node discovery is delegated to `base_finder`; `filter_fn` (if any)
        # decides which of its nodes are kept.
        self.base_finder = base_finder
        self.filter_fn = filter_fn

    def find_nodes(self, model: GraphModule) -> Iterator[Node]:
        candidates = self.base_finder.find_nodes(model)
        predicate = self.filter_fn
        return candidates if predicate is None else filter(predicate, candidates)


class CortexMQuantizer(ComposableQuantizer):

def __init__(self) -> None:
def __init__(self, filter_fn: Callable[[Node], bool] | None = None) -> None:
conv_targets: set[OpOverload] = set()
for key in CONV_OP_PATTERNS.keys() | CONV_TRANSPOSE_OP_PATTERNS.keys():
conv_targets.update(key)
Expand All @@ -63,12 +82,14 @@ def __init__(self) -> None:
quantizers: List[Quantizer] = [
PatternQuantizer(
INT8_PER_CHANNEL_CONFIG,
node_finder=NodeTargetNodeFinder(list(conv_targets)),
node_finder=FilteredNodeFinder(
NodeTargetNodeFinder(list(conv_targets)), filter_fn
),
pattern_matcher=pattern_matcher,
),
PatternQuantizer(
INT8_PER_TENSOR_CONFIG,
node_finder=GlobalNodeFinder(),
node_finder=FilteredNodeFinder(GlobalNodeFinder(), filter_fn),
pattern_matcher=pattern_matcher,
),
SharedQspecQuantizer(),
Expand Down
3 changes: 3 additions & 0 deletions backends/nxp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Key into the `node.meta` dictionary. If a node maps this key to `True`:
#   * the node will not be quantized by the NeutronQuantizer, and
#   * it will not be selected for delegation by the NeutronPartitioner.
NXP_NEUTRON_BACKEND_IGNORE = "NXP_NEUTRON_BACKEND_IGNORE"
4 changes: 4 additions & 0 deletions backends/nxp/backend/edge_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ def _is_quantize(node_: Node) -> bool:
]


def is_qdq_op(node: Node) -> bool:
    """Return `True` if `node` is either a quantize or a dequantize operation."""
    return any(check(node) for check in (_is_quantize, _is_dequantize))


def previous_non_qdq_node(node: Node, input_index: int = 0) -> Node | None:
"""Return the first node which is not a `quantize` or `dequantize`, found by traversing the graph backwards
starting with the `node.args[input_index]`,
Expand Down
Empty file.
186 changes: 186 additions & 0 deletions backends/nxp/imxrt700cm/imxrt700cm_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Copyright 2026 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import copy
from typing import Callable

import torch

from executorch.backends.cortex_m.passes import CortexMPassManager
from executorch.backends.nxp.backend.custom_delegation_options import (
CustomDelegationOptions,
)
from executorch.backends.nxp.backend.edge_helper import is_qdq_op
from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
NeutronEdgePassManager,
)
from executorch.backends.nxp.imxrt700cm.imxrt700cm_quantizer import IMXRT700CMQuantizer
from executorch.backends.nxp.neutron_partitioner import (
NeutronPartitioner,
NXP_DELEGATION_TAG,
)
from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
from executorch.backends.nxp.quantizer.utils import calibrate_and_quantize
from executorch.backends.nxp.tests.executorch_pipeline import (
get_random_calibration_inputs,
ModelInputSpec,
to_model_input_spec,
to_quantized_edge_program,
)
from executorch.exir import EdgeProgramManager, to_edge
from executorch.exir.backend.partitioner import PartitionResult
from executorch.exir.dialects.edge._ops import EdgeOpOverload
from torch.fx import Node


def lower_to_imxrt700cm(
    model: torch.nn.Module,
    input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]],
    get_calibration_inputs_fn: Callable[
        [tuple[ModelInputSpec, ...]], list[tuple[torch.Tensor, ...]]
    ] = get_random_calibration_inputs,
    target: str = "imxrt700",
    remove_quant_io_ops: bool = False,
    custom_delegation_options: CustomDelegationOptions | None = None,
    use_neutron_for_format_conversion: bool = True,
    use_quant_state_dict: bool = True,
    fetch_constants_to_sram: bool = False,
    dump_kernel_selection_code: bool = False,
) -> EdgeProgramManager:
    """Lower model to hybrid Neutron + Cortex-M backend.

    Pipeline:
        1. Identify nodes not supported by Neutron.
            1.1 Quantize a copy of the model with 1 dummy calibration tensor.
            1.2 Run NeutronPartitioner to mark supported nodes.
            1.3 Extract the names of the aten operators unsupported by Neutron.
        2. Quantize using a hybrid quantizer, which applies NeutronQuantizer to some nodes, and CortexMQuantizer to others.
        3. Lower to edge using Neutron backend.
        4. Run Cortex-M passes on the edge program to replace leftover nodes with Cortex-M operators.

    NOTE: Some Cortex-M operators require the channels last dim order. So the provided `model` and `example_inputs`
           should use the channels last memory format for best results.

    :param custom_delegation_options: Optional delegation overrides. When omitted, a fresh
        `CustomDelegationOptions` is constructed per call.

    TODO (Martin) The Cortex-M backend requires some aten nodes to be preserved in the edge program.
     This is not yet implemented. (EIEX-805)
    """
    # Build the default lazily so that each call gets its own instance. A
    # `= CustomDelegationOptions()` default would be a single mutable object
    # shared across all calls (flake8 B008).
    if custom_delegation_options is None:
        custom_delegation_options = CustomDelegationOptions()

    input_spec = to_model_input_spec(input_spec)

    # Discover the names (stored in node.meta["torch_fn"][0]) of the aten operators which are not supported by Neutron.
    # The Cortex-M backend will be used for these if possible.
    cortex_m_designated_node_identifiers = _get_neutron_unsupported_node_identifiers(
        model,
        input_spec,
        target,
    )

    # Use the standard Neutron lowering pipeline, swapping in the hybrid
    # IMXRT700CMQuantizer so the non-Neutron nodes get Cortex-M annotations.
    edge_program_manager = to_quantized_edge_program(
        model,
        input_spec,
        get_calibration_inputs_fn=get_calibration_inputs_fn,
        target=target,
        remove_quant_io_ops=remove_quant_io_ops,
        custom_delegation_options=custom_delegation_options,
        get_quantizer_fn=lambda: IMXRT700CMQuantizer(
            NeutronTargetSpec(target), cortex_m_designated_node_identifiers
        ),
        use_neutron_for_format_conversion=use_neutron_for_format_conversion,
        use_quant_state_dict=use_quant_state_dict,
        fetch_constants_to_sram=fetch_constants_to_sram,
        dump_kernel_selection_code=dump_kernel_selection_code,
    )

    # Apply Cortex-M passes to replace the remaining nodes with Cortex-M variants where possible.
    pass_manager = CortexMPassManager(edge_program_manager.exported_program())
    edge_program_manager._edge_programs["forward"] = pass_manager.transform()

    return edge_program_manager


def get_non_delegated_nodes(partition_result: PartitionResult) -> list[Node]:
    """Return a list of nodes which were not marked by the NeutronPartitioner for delegation."""

    def _is_compute_op(node: Node) -> bool:
        # Only `call_function` nodes whose target is an edge operator count as
        # compute ops (this excludes getitem, ExecutorchCallDelegate, ...).
        # Quantize/dequantize operations are excluded as well.
        if node.op != "call_function":
            return False
        if not isinstance(node.target, EdgeOpOverload):
            return False
        return not is_qdq_op(node)

    graph_nodes = partition_result.tagged_exported_program.graph.nodes
    return [
        node
        for node in graph_nodes
        if _is_compute_op(node) and NXP_DELEGATION_TAG not in node.meta
    ]


def _get_neutron_unsupported_node_identifiers(
    model: torch.nn.Module,
    input_spec: tuple[ModelInputSpec, ...],
    target: str,
    use_quant_state_dict: bool = False,
) -> set[str]:
    """Identify nodes not supported by Neutron.
    This is done by running quantization with dummy calibration data and then applying the NeutronPartitioner.

    :param model: Input model to analyze.
    :param input_spec: Tuple of objects containing information about the model inputs.
    :param target: Neutron target to use for quantization.
    :param use_quant_state_dict: If `True` the state dict from the quantized model will be used to assess operator
                                  support by the NeutronPartitioner.
    :return: Set of identifiers of nodes (stored in node.meta["torch_fn"][0]) which are not supported by Neutron.
    """
    dummy_inputs = tuple(
        torch.rand(spec.shape, dtype=spec.dtype) for spec in input_spec
    )
    aten_program = torch.export.export(model, dummy_inputs, strict=True)

    # Run the discovery on a deep copy so the caller's program stays untouched.
    program_for_analysis = copy.deepcopy(aten_program)

    # A single dummy sample is sufficient for calibration here: only speed and
    # operator support matter, not quantization quality.
    target_spec = NeutronTargetSpec(target)
    quantized_program = calibrate_and_quantize(
        model=program_for_analysis,
        calibration_inputs=[dummy_inputs],
        quantizer=NeutronQuantizer(target_spec),
    )

    # Partition with Neutron.
    partitioner = NeutronPartitioner(
        generate_neutron_compile_spec(target),
        target_spec,
        post_quantization_state_dict=(
            quantized_program.state_dict() if use_quant_state_dict else None
        ),
    )

    edge_manager = to_edge(
        torch.export.export(quantized_program, dummy_inputs, strict=True)
    )
    edge_manager = edge_manager.transform(NeutronEdgePassManager())
    partition_result = partitioner.partition(edge_manager.exported_program())

    # Map the edge compute operators that were not delegated to Neutron back to
    # their original aten nodes via the "torch_fn" metadata.
    return {
        node.meta["torch_fn"][0]
        for node in get_non_delegated_nodes(partition_result)
        if "torch_fn" in node.meta
    }
104 changes: 104 additions & 0 deletions backends/nxp/imxrt700cm/imxrt700cm_quantizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2026 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch

from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
from executorch.backends.nxp import NXP_NEUTRON_BACKEND_IGNORE
from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
from torch.fx import GraphModule, Node
from torchao.quantization.pt2e.quantizer import Quantizer
from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY


class IMXRT700CMQuantizer(Quantizer):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quantizers are meant to be composable. A recipe is the right user-facing abstraction to target an SoC with multiple different backends. Take a look at https://github.com/pytorch/executorch/blob/main/export/tests/test_target_recipes.py especially something like get_android_recipe to understand how two or more quantizers / partitioners are encapsulated and made to work together.

In your case, I imagine a target recipe for rt700 with neutron and cortex-m.

Copy link
Collaborator Author

@MartinPavella MartinPavella Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @digantdesai for the insights. I have looked into it, and recipes definitely look like the right way forward.
I analyzed the state in executorch:

  • To introduce an SoC recipe would require having recipes for both Neutron and Cortex-M backend (both missing). Alternatively the current Cortex-M and Neutron pipelines can be combined into a single recipe but from reuse perspective a base recipe for both backend seems better from my opinion. Our Neutron backend pipeline is currently implemented in

    def to_quantized_edge_program(

  • The Neutron pipeline contains some kernel registration functionality, as only it knows what NPU kernels are required. This would probably require the creation of a new Stage type

    def to_quantized_edge_program(

    Or at least I didn't find any stage providing the functionality to just execute a function based on presence of an option.

  • The QAT appears to not be supported. The QuantizeStage explicitly states it performs post-training quantization. I see that the SourceTransformStage also enables quantization in some way, but it doesn't seem QAT is supported. So perhaps this would require another new Stage type (or modification on an existing stage).

Given this, enabling the RT700 Neutron+Cortex-M backend via a recipe requires changes in multiple backends, and this PR would end up quite large. Can we do this in multiple stages? Such as:

  1. Experimentally, continue with this early implementation introducing the option to combine Cortex-M and Neutron Backends for the i.MXRT700.
  2. Rework the current Neutron lowering pipeline to a recipe, and the same for the Cortex-M backend. Here we would potentially introduce new Stages.
  3. Rework the imxrt700cm lowering to a recipe
  4. Based on consequent discussion extend for QAT training.

For Cortex-M we need to sync with Arm too.
What is your opinion?

"""Hybrid quantizer that uses NeutronQuantizer for Neutron supported ops and CortexMQuantizer for the rest."""

def __init__(
    self,
    target_spec: NeutronTargetSpec,
    cortex_m_designated_node_identifiers: set[str],
):
    """
    :param target_spec: Neutron target specification
    :param cortex_m_designated_node_identifiers: Set of identifiers (stored in node.meta["torch_fn"][0]) of nodes
                                                  that are not supported by Neutron. Nodes not in this set will use
                                                  the NeutronQuantizer. Nodes in this set and nodes skipped by
                                                  NeutronQuantizer will be quantized with the CortexMQuantizer.
    """
    super().__init__()
    self.neutron_quantizer = NeutronQuantizer(target_spec)

    def _needs_cortex_m(node: Node) -> bool:
        # Cortex-M takes the nodes explicitly routed to it, plus any node the
        # NeutronQuantizer left without a quantization annotation.
        if node.meta.get(NXP_NEUTRON_BACKEND_IGNORE, False):
            return True
        return Q_ANNOTATION_KEY not in node.meta

    self.cortex_m_quantizer = CortexMQuantizer(filter_fn=_needs_cortex_m)

    self.target_spec = target_spec
    self.cortex_m_designated_node_identifiers = cortex_m_designated_node_identifiers

def annotate(self, model: GraphModule) -> GraphModule:
    """Annotate `model` with a hybrid Neutron + Cortex-M quantization scheme."""
    # Due to how SharedQuantizationSpecs are used, skipping select ops inside
    # the NeutronQuantizer would cause errors. The strategy is therefore:
    #   1. Let the NeutronQuantizer annotate every node it supports.
    #   2. Mark the nodes destined for the Cortex-M backend and strip their
    #      Neutron annotations. (The mark is also used later by the
    #      NeutronPartitioner.)
    #   3. Run the CortexMQuantizer, whose filter restricts it to exactly the
    #      marked / unannotated nodes.
    model = self.neutron_quantizer.annotate(model)

    self._mark_nodes_to_be_quantized_by_cortex_m_quantizer(
        model, self.cortex_m_designated_node_identifiers
    )

    for node in model.graph.nodes:
        is_marked = node.meta.get(NXP_NEUTRON_BACKEND_IGNORE, False)
        if node.op == "call_function" and is_marked:
            # This node will be quantized by the CortexMQuantizer — drop the
            # Neutron annotation so it does not conflict.
            node.meta.pop(Q_ANNOTATION_KEY, None)
            node.meta.pop("quantizer_matched", None)

    return self.cortex_m_quantizer.annotate(model)

def transform_for_annotation(
    self, model: torch.fx.GraphModule
) -> torch.fx.GraphModule:
    """Run both quantizers' pre-annotation transforms, Neutron first."""
    for quantizer in (self.neutron_quantizer, self.cortex_m_quantizer):
        model = quantizer.transform_for_annotation(model)
    return model

def validate(self, model: GraphModule) -> None:
    """Delegate validation of the annotated `model` to both sub-quantizers."""
    for quantizer in (self.neutron_quantizer, self.cortex_m_quantizer):
        quantizer.validate(model)

# noinspection PyMethodMayBeStatic
def _mark_nodes_to_be_quantized_by_cortex_m_quantizer(
    self, graph: GraphModule, cortex_m_designated_node_identifiers: set[str]
):
    """Mark nodes which were selected to be handled by the Cortex-M backend.

    The mark is `node.meta[NXP_NEUTRON_BACKEND_IGNORE] = True`. Nodes are
    matched by their `node.meta["torch_fn"][0]` identifier.
    """
    for node in graph.graph.nodes:
        torch_fn = node.meta.get("torch_fn")
        if torch_fn is None:
            continue
        if torch_fn[0] in cortex_m_designated_node_identifiers:
            # This node was selected specifically for the Cortex-M backend.
            node.meta[NXP_NEUTRON_BACKEND_IGNORE] = True
Loading
Loading