pytorch · SS-JIA · Mar 25, 2026
@@ -9,97 +9,97 @@
 #version 450 core
 
 ${define_required_extensions("texture3d", DTYPE)}
-${define_explicit_type_extensions(DTYPE)}
 
 #define PRECISION ${PRECISION}
 
-#define VEC4_T ${texel_type(DTYPE)}
-#define T ${buffer_scalar_type(DTYPE)}
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
 
 ${define_active_storage_type("texture3d")}
 
+#extension GL_EXT_control_flow_attributes : require
+
 layout(std430) buffer;
 
-#include "indexing_utils.h"
+#include "common.glslh"
+#include "indexing.glslh"
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
 
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+
 layout(push_constant) uniform restrict Block {
-  ivec4 out_sizes;
-  ivec4 in_sizes;
-  ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
+  ivec4 permute_dims;
 };
 
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
-const lowp int out_packed_dim = unhash_packed_dim(out_layout);
-
-${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
-const lowp int in_packed_dim = unhash_packed_dim(in_layout);
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+const int out_packed_dim = get_packed_dim(out_layout);
+const int in_packed_dim = get_packed_dim(in_layout);
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// Convert output tensor index to input tensor index based on permutation
+// Convert output tensor index to input tensor index based on permutation.
+// permute_dims[i] = j means output dim i comes from input dim j.
+// We write: in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w}
+// This uses literal component access on the push constant (safe) and dynamic
+// indexing into the local in_tidx variable (also safe).
 ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
   ivec4 in_tidx;
-
-  // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
   in_tidx[permute_dims.x] = out_tidx.x;
   in_tidx[permute_dims.y] = out_tidx.y;
   in_tidx[permute_dims.z] = out_tidx.z;
   in_tidx[permute_dims.w] = out_tidx.w;
-
   return in_tidx;
 }
 
-// Check if we can use the fast path where texels from the input tensor can be
-// copied directly into the output tensor. This occurs when the packed dimension
-// is preserved in the permutation, i.e. reading a texel from the output tensor
-// produces 4 texels along the same dimension as reading a texel from the input
-// tensor.
-bool can_use_fast_path() {
-  // Fast path is possible when the packed dimension is preserved in the permutation
-  // This means permute_dims[out_packed_dim] == in_packed_dim
-  return permute_dims[out_packed_dim] == in_packed_dim;
-}
-
 void main() {
-  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
-  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
 
-  if (any(greaterThanEqual(out_tidx, out_sizes))) {
+  if (out_of_bounds(out_pos, outp)) {
     return;
   }
 
-  if (can_use_fast_path()) {
+  TensorIndex4D out_tidx =
+      texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);
+
+  // Check if packed dimension is preserved in the permutation. Use safe_idx
+  // to avoid dynamic indexing of push constant with spec-const-derived index.
+  const bool fast_path =
+      safe_idx(permute_dims, out_packed_dim) == in_packed_dim;
+
+  if (fast_path) {
     // Fast path: packed dimension is preserved, so we can copy texels directly
-    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+    ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
+    TensorIndex4D in_tidx;
+    in_tidx.data = in_tidx_data;
 
-    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
-  }
-  else {
+    ivec3 in_pos =
+        tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout);
+    VEC4_T in_texel = texelFetch(t_in, in_pos, 0);
+
+    imageStore(t_out, out_pos, in_texel);
+  } else {
     // Slow path: packed dimension is not preserved, so each element of the
-    // output texel may be "sourced" from a different texel in the input tensor.
-    // Therefore each output texel element is processed individually.
+    // output texel may come from a different texel in the input tensor.
     VEC4_T out_texel = VEC4_T(0);
 
-    for (int texel_i = 0; texel_i < 4; ++texel_i) {
-      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-      int element_idx = in_tidx[in_packed_dim] % 4;
+    for (int comp = 0; comp < 4; comp++) {
+      ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
+      TensorIndex4D in_tidx;
+      in_tidx.data = in_tidx_data;
 
-      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
-      T selected_value = T(in_texel[element_idx]);
+      TextureElementIndex in_elem =
+          tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);
 
-      out_texel[texel_i] = selected_value;
+      VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
+      out_texel[comp] = in_texel[in_elem.comp];
 
-      out_tidx[out_packed_dim]++;
+      out_tidx.data[out_packed_dim]++;
     }
 
-    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
+    imageStore(t_out, out_pos, out_texel);
   }
 }