Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions docs/source/compiler-memory-planning.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,46 @@ program = edge_program.to_executorch(
)
```

> **Note:** Custom pool passes that pre-assign `mem_id` are not yet compatible
> with `enable_non_cpu_memory_planning=True`. When per-device planning is
> enabled, device buffers are appended after the CPU buffers in the global
> `bufsizes` array. If a custom pass has already set `mem_id` values (e.g.
> `mem_id=2` or `mem_id=3`), those slots may collide with the device-buffer
> slots, leading to incorrect memory layout. If per-device planning is enabled
> and a custom pass has pre-assigned `mem_id` values while non-CPU tensors are
> present, `apply_algo` raises a `NotImplementedError`.

Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12).

## Device-Aware Memory Planning

When `enable_non_cpu_memory_planning=True` is set on `ExecutorchBackendConfig`,
the memory planning pass partitions tensor specs by their device type and runs
the planning algorithm independently for each device. This produces separate
memory buffers for each device (e.g. CPU vs. CUDA), ensuring that device memory
and host memory are never mixed.

```python
program = edge_program.to_executorch(
exir.ExecutorchBackendConfig(
enable_non_cpu_memory_planning=True,
)
)
```

The resulting `bufsizes` array layout depends on which devices are present:

| Scenario | bufsizes | Description |
|---|---|---|
| CPU only | `[0, cpu_size]` | Same as legacy behavior |
| CUDA only | `[0, cuda_size]` | Buffer 1 is CUDA, no wasted CPU slot |
| CPU + CUDA | `[0, cpu_size, cuda_size]` | Buffer 1 is CPU, buffer 2 is CUDA |

**Current limitations:**
- Not compatible with custom pool passes that pre-assign `spec.mem_id` (see note above).
- Submodule buffer sizes (from control-flow submodules like `cond`/`while`/`map`)
are applied only to the CPU partition. This is safe today because on-device
tensors only appear as delegate blob I/O, never inside control-flow submodules.

## Debugging Tool

Please refer to [Memory Planning Inspection](memory-planning-inspection.md) for a tool to inspect the result of memory planning.
6 changes: 6 additions & 0 deletions exir/capture/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,9 @@ class ExecutorchBackendConfig:

# Experimental: If set to true, we run a pass to reinplace ops in the graph.
run_reinplace_pass: bool = False

# When True, memory planning partitions specs by device and runs the
# algorithm independently per device, producing separate buffers for CPU
# vs. accelerator memory. Default False preserves the legacy behavior
# where all tensors are planned into CPU memory regardless of device.
enable_non_cpu_memory_planning: bool = False
162 changes: 133 additions & 29 deletions exir/memory_planning.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from executorch.exir.delegate import executorch_call_delegate
from executorch.exir.error import internal_assert, InternalError
from executorch.exir.operator.convert import is_inplace_variant, is_out_variant
from executorch.exir.schema import TensorShapeDynamism
from executorch.exir.schema import DeviceType, NonConstBufferDevice, TensorShapeDynamism
from executorch.exir.tensor import TensorSpec
from torch import fx
from torch.export.exported_program import (
Expand Down Expand Up @@ -1203,7 +1203,7 @@
return bufsizes


def apply_algo(

Check warning on line 1206 in exir/memory_planning.py

View workflow job for this annotation

GitHub Actions / lintrunner

FLAKE8 C901

'apply_algo' is too complex (14) See https://www.flake8rules.com/rules/C901.html.
algo: Callable[..., list[int]],
graph_module: torch.fx.GraphModule,
alignment: int,
Expand All @@ -1211,10 +1211,19 @@
alloc_graph_input: bool = True,
alloc_graph_output: bool = True,
alloc_mutable_buffers: bool = True,
enable_non_cpu_memory_planning: bool = False,
) -> list[int]:
"""
Recursively apply algo to graph_module and its submodules for control flow.

Partitions specs by device type and device idx, and runs the memory planning
algorithm independently per device, then merges results into separate buffers.
This ensures device memory and CPU memory are never mixed.

When enable_non_cpu_memory_planning is False (default), all specs are planned
into a single CPU memory pool regardless of their device attribute. This
preserves the legacy behavior. Set to True to enable per-device partitioning.

Algo implementation should handle one of two meta entries for submodules:
1. input_mem_buffer_sizes: List of int offset bytes. Memory allocated by
`algo` should start at the offset specified by this list;
Expand All @@ -1229,49 +1238,144 @@
`operand` arg. The memory for operands is unused.
"""
# Extract the nodes and their lifespans from the graph_module
# Difficult to just filter the list of specs returned by this due to
# how we flag trainable weights.
_ = update_all_tensors_lifetime(graph_module, graph_signature)

# Filter specs based on alloc_graph_input and alloc_graph_output
specs = collect_specs_from_nodes(
graph_module.graph.nodes,
graph_signature,
do_assertion=False,
ignore_graph_input=not alloc_graph_input,
ignore_graph_output=not alloc_graph_output,
ignore_mutable_buffers=not alloc_mutable_buffers,
# Collect and materialize specs into a set so we can iterate multiple
# times and partition by device.
all_specs: set[TensorSpec] = set(
collect_specs_from_nodes(
graph_module.graph.nodes,
graph_signature,
do_assertion=False,
ignore_graph_input=not alloc_graph_input,
ignore_graph_output=not alloc_graph_output,
ignore_mutable_buffers=not alloc_mutable_buffers,
)
)

# Get temporary specs for submodules to set aside space during execution
# of submodules.
# NOTE: submodule_bufsizes are currently applied only to the CPU partition.
# This assumes all control-flow submodule tensors (cond/while/map) live in
# CPU memory. Today this is safe because on-device tensors only appear as
# delegate blob I/O, which never lives inside control-flow submodules.
# If device tensors ever appear in submodules, _apply_algo_to_submodules
# will need per-device partitioning as well.
submodule_bufsizes = _apply_algo_to_submodules(
algo, graph_module, alignment, graph_signature
)

# Update `input_mem_buffer_sizes` in graph_module. This will allow existing
# algos to work using `input_mem_buffer_sizes` or use
# `non_const_buffer_sizes` directly.
# pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
graph_module.input_mem_buffer_sizes = submodule_bufsizes

# Get extra padding for XNNPACK if needed
extra_padding = 0
if _contains_xnnpack_delegate(graph_module):
extra_padding = 64

# Pass the filtered specs to the algorithm
bufsizes: list[int] = algo(
alignment,
specs,
graph_module,
graph_signature,
extra_padding,
# 1. Partition specs by (device_type, device_index).
# Different device indices on the same device type (e.g. CUDA:0 vs CUDA:1)
# get separate memory buffers.
_CPU_KEY: tuple[DeviceType, int] = (DeviceType.CPU, 0)
specs_by_device: dict[tuple[DeviceType, int], set[TensorSpec]] = defaultdict(set)
if enable_non_cpu_memory_planning:
has_non_cpu_specs = False
has_pre_assigned_mem_id = False
for spec in all_specs:
device_key = (spec.device, spec.device_index)
specs_by_device[device_key].add(spec)
if spec.device != DeviceType.CPU:
has_non_cpu_specs = True
if spec.mem_id is not None:
has_pre_assigned_mem_id = True

# Custom pool passes pre-assign mem_ids (e.g. mem_id=2, 3, …) to place
# tensors into specific memory arenas. Per-device partitioning appends
# device buffers after the CPU buffers, and the remap formula
# global_mem_id = (local_mem_id - 1) + base_mem_id
# assumes the algo-local numbering starts at 1. If a custom pass has
# already set mem_ids > 1 on the CPU side, the device-buffer slots may
# collide with those custom pool slots.
# TODO(gasoonjia): support custom pools + per-device planning by reserving
# device slots after the highest custom pool id.
if has_non_cpu_specs and has_pre_assigned_mem_id:
raise NotImplementedError(
"enable_non_cpu_memory_planning is not yet compatible with "
"custom memory pool passes that pre-assign spec.mem_id. "
"The per-device buffer slots may collide with custom pool "
"mem_ids. Please disable enable_non_cpu_memory_planning or "
"remove the custom mem_id assignments."
)
else:
# Legacy behavior: all specs planned into CPU memory regardless of device
specs_by_device[_CPU_KEY] = all_specs

# 2. Plan each device independently
global_bufsizes: list[int] = [0] # index 0 reserved for constants
# Track (device_type, device_index) for each buffer slot
buffer_devices: list[tuple[DeviceType, int]] = [_CPU_KEY]

# Process CPU:0 first (if present), then other devices sorted by
# (type.value, index) so the ordering is deterministic.
device_order = sorted(
specs_by_device.keys(),
key=lambda dk: (dk != _CPU_KEY, dk[0].value, dk[1]),
)

# pyre-ignore[6]: Incompatible parameter type [6]
# In call `insert_calls_to_free`, for 2nd positional argument, expected `Set[TensorSpec]` but got `Iterable[TensorSpec]`
insert_calls_to_free(graph_module, specs)
for device_key in device_order:
device_specs = specs_by_device[device_key]

graph_module.meta.update({"non_const_buffer_sizes": bufsizes})
return bufsizes
# Only apply submodule pre-allocation for CPU specs; device buffers
# do not share memory space with CPU submodule arenas.
# pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
graph_module.input_mem_buffer_sizes = (
submodule_bufsizes if device_key == _CPU_KEY else []
)

# Run algorithm independently on this device's specs
device_bufsizes = algo(
alignment, device_specs, graph_module, graph_signature, extra_padding
)

# Calculate base mem_id in global space
base_mem_id = len(global_bufsizes)

# Append buffer sizes (skip index 0 which is constants placeholder)
global_bufsizes.extend(device_bufsizes[1:])

# Track device key for each new buffer slot
for _ in device_bufsizes[1:]:
buffer_devices.append(device_key)

# Remap spec mem_ids from algo-local to global.
# At this point spec.mem_id has been set by MemoryPlanningAlgorithmSuite:
# the suite runs each algorithm (e.g. greedy), picks the best result,
# and writes the winning mem_id/mem_offset/mem_obj_id back onto each
# spec. For specs with no pre-assigned mem_id the algorithm defaults
# to mem_id=1; custom-pool passes may pre-assign other values (e.g. 3).
# We remap from the algo-local numbering (1-based) to the global
# position: global_mem_id = (local_mem_id - 1) + base_mem_id.
for spec in device_specs:
if spec.mem_id is not None:
spec.mem_id = (spec.mem_id - 1) + base_mem_id

# Ensure backward compatibility: at least [0, 0] when no specs exist
if len(global_bufsizes) < 2:
global_bufsizes.append(0)
buffer_devices.append(_CPU_KEY)

# 3. Insert free calls and build device buffer mapping
insert_calls_to_free(graph_module, all_specs)

# Only record non-CPU buffer entries. CPU buffers are the default and
# do not need explicit device metadata in the serialized program.
non_const_buffer_device: Optional[list[NonConstBufferDevice]] = None
has_device_buffers = any(dk[0] != DeviceType.CPU for dk in buffer_devices)
if has_device_buffers:
non_const_buffer_device = [
NonConstBufferDevice(buffer_idx=i, device_type=dt, device_index=di)
for i, (dt, di) in enumerate(buffer_devices)
if (dt, di) != _CPU_KEY
]

graph_module.meta["non_const_buffer_sizes"] = global_bufsizes
if non_const_buffer_device is not None:
graph_module.meta["non_const_buffer_device"] = non_const_buffer_device
return global_bufsizes
3 changes: 3 additions & 0 deletions exir/passes/memory_planning_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def __init__(
alloc_mutable_buffers: bool = True,
share_mutable_buffers: bool = False,
alignment: int = ALIGNMENT,
enable_non_cpu_memory_planning: bool = False,
) -> None:
r"""
alloc_graph_input/alloc_graph_output will have 4 different combinations
Expand All @@ -173,6 +174,7 @@ def __init__(
self.alloc_mutable_buffers = alloc_mutable_buffers
self.share_mutable_buffers = share_mutable_buffers
self.alignment = alignment
self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning
self.state = _MemoryPlanningState()

def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
Expand Down Expand Up @@ -250,6 +252,7 @@ def run(
# If mutable buffers are shared, then do not allocate them in the
# main memory planning algo; they are allocated in run_multimethod.
self.alloc_mutable_buffers and not self.share_mutable_buffers,
self.enable_non_cpu_memory_planning,
)

if self.share_mutable_buffers and graph_signature is not None:
Expand Down
6 changes: 6 additions & 0 deletions exir/program/_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -1788,6 +1788,12 @@ def to_executorch( # noqa (FLAKE8) C901
)
else:
memory_planning_pass = config.memory_planning_pass
# Propagate enable_non_cpu_memory_planning from the top-level config
# to the pass instance so that device-aware partitioning is applied.
if hasattr(memory_planning_pass, "enable_non_cpu_memory_planning"):
memory_planning_pass.enable_non_cpu_memory_planning = (
config.enable_non_cpu_memory_planning
)
# TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work
if hasattr(memory_planning_pass, "run"):
new_gm_res = memory_planning_pass.run(new_gm, new_signature)
Expand Down
Loading
Loading