Do not raise error when quant primitives are left after partitioner (#10573)

metascroy · web-flow · commit a2b995227081 · 2025-04-30T18:58:18.000-07:00
Currently _sanity_check_graph_for_non_decomp_ops raises an Exception if
a delegate asks for an op to be preserved, but doesn't lower it.

In general, this is a sensible thing to do, but for quant primitives, it
is less sensible, since what gets lowered are patterns involving the
quant primitives and FP32 ops.

XNNPACK asks that quant primitives be preserved, and so if a quant
primitive is not lowered (e.g., it is part of embedding quant), an error
is thrown.

In this PR, we:

* Define a central location for _QUANT_PRIMITIVES (with TODO task of
moving this to torchao)
* Use these _QUANT_PRIMITIVES to avoid raising an error in
_sanity_check_graph_for_non_decomp_ops
* Use _QUANT_PRIMITIVES in tracer.py to not decompose them during to_edge,
and in const_prop_pass to not constant propagate them (this logic existed
previously, but is being rewritten using the central _QUANT_PRIMITIVES
list).
diff --git a/exir/TARGETS b/exir/TARGETS
@@ -15,8 +15,8 @@ python_library(
         ":types",
         "//caffe2:torch",
         "//executorch/exir/operator:convert",
+        "//executorch/exir/operator::util",
         "//executorch/extension/pytree:pylib",
-        "//pytorch/ao:torchao",
     ],
 )
 
diff --git a/exir/operator/TARGETS b/exir/operator/TARGETS
@@ -32,6 +32,8 @@ python_library(
     ],
     deps = [
         "//caffe2/torchgen:torchgen",
+        "//pytorch/ao:torchao",
+        "//caffe2:torch",
     ],
 )
 
diff --git a/exir/operator/util.py b/exir/operator/util.py
@@ -6,6 +6,8 @@
 
 # pyre-strict
 
+import torch
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 from torchgen.model import FunctionSchema, SchemaKind
 from torchgen.native_function_generation import (
     functional_to_out_signature,
@@ -39,3 +41,28 @@ def gen_out_variant_schema(func_op_schema: str) -> str:
         raise RuntimeError(f"SchemaKind: {func.kind()} is not supported")
 
     return f"{namespace}::{schema}" if namespace else schema
+
+
+# TODO: move to torchao
+_QUANT_PRIMITIVES = [
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.convert_element_type.no_fuse,
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+    torch.ops.quantized_decomposed.choose_qparams.tensor,
+]
+try:
+    import torchao  # noqa: F401
+
+    _QUANT_PRIMITIVES.extend(
+        [
+            torch.ops.torchao.dequantize_affine.default,
+            torch.ops.torchao.quantize_affine.default,
+            torch.ops.torchao.choose_qparams_affine.default,
+        ]
+    )
+except ImportError:
+    pass
diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS
@@ -119,6 +119,8 @@ python_library(
         "//caffe2:torch",
         "//executorch/exir/dialects:lib",
         "//executorch/exir/dialects/edge:lib",
+        "//executorch/exir/operator::util",
+        "//executorch/exir/passes:replace_aten_with_edge_pass",
     ],
 )
 
diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
@@ -13,6 +13,8 @@
 import torch
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.operator.util import _QUANT_PRIMITIVES
+from executorch.exir.passes.replace_aten_with_edge_pass import aten_to_edge
 from torch._export.utils import (
     get_buffer,
     get_lifted_tensor_constant,
@@ -25,35 +27,13 @@
 from torch.export.exported_program import InputKind, InputSpec, TensorArgument
 from torch.utils import _pytree as pytree
 
-
 # Avoid propagating constants for `exir.ops.edge.aten.full.default`.
 # Propagating aten.full can significantly increase compiled model size.
 _DEFAULT_SKIP_TARGETS = {exir_ops.edge.aten.full.default}
 
 # Do not const prop quantization primitives
-_QDQ_OPS = [
-    exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
-    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
-    exir_ops.edge.quantized_decomposed.convert_element_type.no_fuse,
-    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
-    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
-    exir_ops.edge.quantized_decomposed.choose_qparams.tensor,
-]
-try:
-    import torchao  # noqa: F401
-
-    _QDQ_OPS.extend(
-        [
-            exir_ops.edge.torchao.dequantize_affine.default,
-            exir_ops.edge.torchao.quantize_affine.default,
-            exir_ops.edge.torchao.choose_qparams_affine.default,
-        ]
-    )
-except ImportError:
-    pass
-_DEFAULT_SKIP_TARGETS.update(set(_QDQ_OPS))
+_QUANT_PRIMITIVES_EDGE = [aten_to_edge(op) for op in _QUANT_PRIMITIVES]
+_DEFAULT_SKIP_TARGETS.update(set(_QUANT_PRIMITIVES_EDGE))
 
 
 _PRIMITIVE_TYPES = (
diff --git a/exir/program/TARGETS b/exir/program/TARGETS
@@ -35,6 +35,7 @@ python_library(
         "//executorch/exir/capture:config",
         "//executorch/exir/emit:emit",
         "//executorch/exir/emit:lib",
+        "//executorch/exir/operator:util",
         "//executorch/exir/passes:insert_write_back_for_buffers_pass",
         "//executorch/exir/passes:lib",
         "//executorch/exir/passes:normalize_view_copy_base_pass",
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -32,6 +32,7 @@
 from executorch.exir.error import ExportError
 from executorch.exir.graph_module import get_control_flow_submodules
 from executorch.exir.operator.convert import _pybind_schema_to_native_schema
+from executorch.exir.operator.util import _QUANT_PRIMITIVES
 from executorch.exir.pass_base import PassBase
 from executorch.exir.pass_manager import PassType
 from executorch.exir.passes import (
@@ -971,10 +972,14 @@ def _sanity_check_graph_for_non_decomp_ops(
     ops_set_to_not_decompose = {
         aten_to_edge(op) for op in ops_set_to_not_decompose
     }.union(ops_set_to_not_decompose)
+
+    quant_primitives = {aten_to_edge(op) for op in _QUANT_PRIMITIVES}
     for node in program.graph_module.graph.nodes:
         is_op_supported = check_op_support(node) if check_op_support else True
         if (
-            node.op == "call_function" and node.target in ops_set_to_not_decompose
+            node.op == "call_function"
+            and node.target in ops_set_to_not_decompose
+            and node.target not in quant_primitives
         ) and is_op_supported:
             warning_str = (
                 f"Node {node} with op {node.target} was not decomposed or delegated.\n"
@@ -988,7 +993,9 @@ def _sanity_check_graph_for_non_decomp_ops(
         for node in submod.graph.nodes:
             is_op_supported = check_op_support(node) if check_op_support else True
             if (
-                node.op == "call_function" and node.target in ops_set_to_not_decompose
+                node.op == "call_function"
+                and node.target in ops_set_to_not_decompose
+                and node.target not in quant_primitives
             ) and is_op_supported:
                 warning_str = (
                     f"Node {node} with op {node.target} was not decomposed or delegated.\n"
diff --git a/exir/tracer.py b/exir/tracer.py
@@ -41,6 +41,7 @@
 from executorch.exir.error import ExportError, ExportErrorType, InternalError
 from executorch.exir.graph_module import LeafValue
 from executorch.exir.operator.convert import is_out_variant
+from executorch.exir.operator.util import _QUANT_PRIMITIVES
 from executorch.exir.types import ValueSpec
 
 from torch._C import _EnableTorchFunction, DisableTorchFunctionSubclass  # @manual
@@ -54,7 +55,6 @@
 
 from typing_extensions import TypeAlias
 
-
 Value: TypeAlias = Union[
     LeafValue,
     Tuple["Value", ...],
@@ -643,22 +643,7 @@ def _default_decomposition_table(
     # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.exir....
 
     never_decompose = []
-    try:
-        # Do not decompose torchao quant primitives
-        # They have decompositions registered for inductor/CUDA, but in ExecuTorch we
-        # just pattern match them and lower to delegates
-        import torchao  # noqa: F401
-
-        never_decompose.extend(
-            [
-                torch.ops.torchao.quantize_affine.default,
-                torch.ops.torchao.dequantize_affine.default,
-                torch.ops.torchao.choose_qparams_affine.default,
-            ]
-        )
-    except:
-        pass
-
+    never_decompose.extend(_QUANT_PRIMITIVES)
     for op in never_decompose:
         decomps.pop(op, None)
     return decomps  # pyre-fixme[7]

Original file line number	Diff line number	Diff line change
`@@ -15,8 +15,8 @@ python_library(`
`15`	`15`	`":types",`
`16`	`16`	`"//caffe2:torch",`
`17`	`17`	`"//executorch/exir/operator:convert",`
	`18`	`+ "//executorch/exir/operator::util",`
`18`	`19`	`"//executorch/extension/pytree:pylib",`
`19`		`- "//pytorch/ao:torchao",`
`20`	`20`	`],`
`21`	`21`	`)`
`22`	`22`
Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,8 @@ python_library(`
`32`	`32`	`],`
`33`	`33`	`deps = [`
`34`	`34`	`"//caffe2/torchgen:torchgen",`
	`35`	`+ "//pytorch/ao:torchao",`
	`36`	`+ "//caffe2:torch",`
`35`	`37`	`],`
`36`	`38`	`)`
`37`	`39`
Original file line number	Diff line number	Diff line change
`@@ -119,6 +119,8 @@ python_library(`
`119`	`119`	`"//caffe2:torch",`
`120`	`120`	`"//executorch/exir/dialects:lib",`
`121`	`121`	`"//executorch/exir/dialects/edge:lib",`
	`122`	`+ "//executorch/exir/operator::util",`
	`123`	`+ "//executorch/exir/passes:replace_aten_with_edge_pass",`
`122`	`124`	`],`
`123`	`125`	`)`
`124`	`126`