
Commit 6ac5df2

Revert "Delete opt_mul_scalar_out (#12145)" (#12321)
This triggered internal failures; kernels/optimized's tests don't build with Buck in OSS because they use ATen.
1 parent 93e9fcd commit 6ac5df2

File tree

2 files changed: +62 -0 lines changed


kernels/optimized/cpu/op_mul.cpp

Lines changed: 57 additions & 0 deletions
@@ -210,6 +210,63 @@ Tensor& opt_mul_out(
   return out;
 }
 
+Tensor& opt_mul_scalar_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Scalar& b,
+    Tensor& out) {
+  (void)ctx;
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType common_type =
+      utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false);
+  ScalarType out_type = out.scalar_type();
+
+  ET_CHECK(common_type == out_type);
+
+  if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) {
+    common_type = ScalarType::Float;
+  }
+
+  // Resize for dynamic shape
+  auto error = resize_tensor(out, a.sizes());
+  ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor.");
+
+  if (a_type == common_type && a_type == out_type &&
+      a_type != ScalarType::Half && a_type != ScalarType::BFloat16) {
+    ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE, [&]() {
+      CTYPE b_casted = utils::scalar_to<CTYPE>(b);
+
+      using Vec = at::vec::Vectorized<CTYPE>;
+      at::vec::map<CTYPE>(
+          [b_casted](Vec x) { return x * Vec(b_casted); },
+          out.mutable_data_ptr<CTYPE>(),
+          a.const_data_ptr<CTYPE>(),
+          out.numel());
+    });
+  } else {
+    ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() {
+      ET_SWITCH_REALB_TYPES(
+          common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() {
+            ET_SWITCH_REALHBBF16_TYPES(
+                out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() {
+                  CTYPE_IN b_casted = utils::scalar_to<CTYPE_IN>(b);
+
+                  const size_t n = a.numel();
+                  const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
+                  CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+                  for (auto i = 0; i < n; ++i) {
+                    out_data[i] = static_cast<CTYPE_OUT>(
+                        static_cast<CTYPE_IN>(a_data[i]) * b_casted);
+                  }
+                });
+          });
+    });
+  }
+
+  return out;
+}
+
 } // namespace native
 } // namespace executor
 } // namespace torch
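
The vectorized fast path in the restored kernel is the part that pulls in ATen: at::vec::Vectorized and at::vec::map come from ATen's CPU SIMD helpers, which is why the kernels/optimized tests cannot build with Buck in OSS, as the commit message notes. Below is a minimal, standalone sketch of that pattern; the buffer names, sizes, and main() wrapper are illustrative only and not taken from this commit.

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>

#include <cstdint>
#include <vector>

int main() {
  using Vec = at::vec::Vectorized<float>;

  // Illustrative buffers; the kernel operates on Tensor data pointers instead.
  std::vector<float> input(1024, 2.0f);
  std::vector<float> output(input.size());
  const float scalar = 3.0f;

  // Multiply every element by `scalar`, one SIMD vector at a time;
  // at::vec::map handles the tail that does not fill a full vector width.
  at::vec::map<float>(
      [scalar](Vec x) { return x * Vec(scalar); },
      output.data(),
      input.data(),
      static_cast<int64_t>(input.size()));
  return 0;
}

The else branch of the restored kernel, by contrast, is plain scalar C++: each element is promoted to the common type, multiplied by the casted scalar, and narrowed to the output dtype.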

kernels/optimized/optimized.yaml

Lines changed: 5 additions & 0 deletions
@@ -82,6 +82,11 @@
 - arg_meta: null
   kernel_name: torch::executor::opt_mul_out
 
+- op: mul.Scalar_out
+  kernels:
+  - arg_meta: null
+    kernel_name: torch::executor::opt_mul_scalar_out
+
 - op: native_layer_norm.out
   kernels:
   - arg_meta: null
