
Commit 381bf9b

- Add LLM manager
- Support recomposing RMS norm via pattern matching
- Leverage AttentionMaskInterface and AttentionInterface without touching the model structure
- Add eval script to evaluate perplexity (ppl) on device
1 parent 1ceb59b commit 381bf9b


14 files changed, +961 -538 lines changed


backends/qualcomm/_passes/convert_conv1d_to_conv2d.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                 padding = [0] + node.args[4] if num_args > 4 else [0, 0]
                 if node.target == torch.ops.aten.conv1d.default:
                     dilation = [1] + node.args[5] if num_args > 5 else [1, 1]
-                    groups = node.args[6] if num_args > 5 else 1
+                    groups = node.args[6] if num_args > 6 else 1
                     conv_args = (
                         qdq_node_after_unsqueeze,
                         node.args[1],
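
For reference, torch.ops.aten.conv1d.default takes its positional args as (input, weight, bias, stride, padding, dilation, groups), so groups sits at index 6 and is only present when more than six args were captured; the old num_args > 5 guard could index past the end of node.args. A minimal sketch of the corrected check (illustrative only, not part of the commit):

    # Assumed aten.conv1d arg order: (input, weight, bias, stride, padding, dilation, groups)
    def _get_groups(args):
        # args[6] exists only when at least seven positional args were recorded
        return args[6] if len(args) > 6 else 1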

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 2 additions & 1 deletion
@@ -90,7 +90,7 @@ def get_capture_program_passes():
         (I64toI32, True),
         (LayoutTransform, True),
         (RecomposePixelUnshuffle, True),
-        (RecomposeRmsNorm, False),
+        (RecomposeRmsNorm, True),
         (Remove0DTensor, True),
         (RemoveRedundancy, True),
         (TagQuantIO, False),
@@ -188,6 +188,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(RemoveRedundancy(quantization_capture=True))
         self.add_pass(ReduceDynamicRange())
         self.add_pass(RecomposePixelUnshuffle(quantization_capture=True))
+        self.add_pass(RecomposeRmsNorm(quantization_capture=True))
         self.add_pass(ReplaceArangeArgs())
         self.add_pass(DecomposeCDist())
         self.add_pass(DecomposeScaledDotProductAttention())
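
Because RecomposeRmsNorm is now on by default in get_capture_program_passes(), callers no longer need to enable it explicitly; a sketch of the caller side (this mirrors the export_llama_lib.py change later in this commit):

    passes_job = get_capture_program_passes()
    # No longer needed, the pass is active by default:
    # passes_job[RecomposeRmsNorm][QCOM_PASS_ACTIVATE_KEY] = True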

backends/qualcomm/_passes/recompose_rms_norm.py

Lines changed: 58 additions & 47 deletions
@@ -3,84 +3,95 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+from executorch.backends.qualcomm._passes.utils import find_patterns
 import torch

-from executorch.backends.qualcomm.builders.node_visitor import dq_ops
-from executorch.backends.qualcomm.builders.utils import get_parameter, is_parameter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch.fx.passes.utils.source_matcher_utils import get_source_partitions

+def _is_node(node): return isinstance(node, torch.fx.Node)
+def _is_call(node): return _is_node(node) and node.op == 'call_function'
+def _is_placeholder(node): return _is_node(node) and node.op == 'placeholder'
+def _is_get_attr(node): return _is_node(node) and node.op == 'get_attr'
+def _is_add(node): return _is_call(node) and node.target in [exir_ops.edge.aten.add.Tensor, torch.ops.aten.add.Tensor]
+def _is_mean(node): return _is_call(node) and node.target in [exir_ops.edge.aten.mean.dim, torch.ops.aten.mean.dim]
+def _is_mul(node): return _is_call(node) and node.target in [exir_ops.edge.aten.mul.Tensor, torch.ops.aten.mul.Tensor]
+def _is_pow(node): return _is_call(node) and node.target in [exir_ops.edge.aten.pow.Tensor_Tensor, torch.ops.aten.pow.Tensor_Scalar]
+def _is_rsqrt(node): return _is_call(node) and node.target in [exir_ops.edge.aten.rsqrt.default, torch.ops.aten.rsqrt.default]

 class RecomposeRmsNorm(ExportPass):
     """
     Merge decomposed operators back to one super node.
-    TODO: After replacing export_to_edge with to_edge_transform_and_lowering
-    in examples/models/llama/export_llama_lib.py, this pass can be removed
     """

-    def __init__(self, edge_program: torch.export.ExportedProgram):
+    def __init__(self, quantization_capture=False):
         super(RecomposeRmsNorm, self).__init__()
-        self.edge_program = edge_program
-
-    def _get_eps_node(self, nodes):
-        # eps: one of inputs of add node
-        add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0]
-        for a in add_node.args:
-            if isinstance(a, float) or a.op != "call_function":
-                return a
-
-    def _get_gamma_node(self, output_node):
-        # gamma: one of inputs of output node
-        for a in output_node.args:
-            if a.op != "call_function" or a.target in dq_ops:
-                return a
+        self.rms_norm_target = exir_ops.edge.aten.rms_norm.default
+        self.skip_targets = [exir_ops.edge.aten.to.dtype,]
+        if quantization_capture:
+            self.rms_norm_target = torch.ops.aten.rms_norm.default
+            self.skip_targets = [torch.ops.aten.to.dtype,]
+
+    def _get_input_node(self, node):
+        input_node = node.args[0]
+        while input_node.target in self.skip_targets:
+            input_node = input_node.args[0]
+        return input_node

     def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
-        partitions = get_source_partitions(
-            graph, [torch.nn.RMSNorm, torch.ops.aten.rms_norm.default]
-        )
-        for _, src_partitions in partitions.items():
-            for src_partition in src_partitions:
-                input_len = len(src_partition.input_nodes)
-                if input_len == 1:
-                    input_node = src_partition.input_nodes[0]
-                elif input_len == 2:
-                    inp_0, inp_1 = src_partition.input_nodes
-                    input_node = inp_0 if len(inp_0.users) == 2 else inp_1
-                else:
-                    raise RuntimeError(
-                        f"Found a edge case of rms_node partition {src_partition}, which has {input_len} inputs"
-                    )

-                output_node = src_partition.output_nodes[0]
-                eps = self._get_eps_node(src_partition.nodes)
-                if isinstance(eps, torch.fx.Node) and is_parameter(
-                    eps, self.edge_program
-                ):
-                    eps = get_parameter(eps, self.edge_program).item()
-                gamma_node = self._get_gamma_node(output_node)
+        # Root Mean Square normalization math equivalent implementation
+        patterns = [
+            # transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
+            [_is_mul, '*', _is_mul, _is_rsqrt, _is_add, _is_mean, _is_pow],
+            # executorch.examples.models.llama.norm.RMSNorm
+            [_is_mul, '*', _is_mul, _is_rsqrt, _is_add, _is_mean, _is_mul],
+        ]
+
+        for node in graph.nodes:
+            if not _is_mul(node):
+                continue
+
+            rms_norm_patterns = [pattern for pattern in find_patterns(node, patterns) if pattern is not None]
+
+            if len(rms_norm_patterns)>0:
+                # Use first matched pattern
+                rms_norm_pattern = rms_norm_patterns[0][0]
+                last_mul_node = rms_norm_pattern[0]
+                gamma_node = None
+                # weight should be a constant
+                for arg in last_mul_node.args:
+                    if _is_get_attr(arg) or _is_placeholder(arg):
+                        gamma_node = arg
+                if gamma_node is None:
+                    continue
+
+                eps = rms_norm_pattern[4].args[1]
+                if isinstance(eps, torch.fx.Node):
+                    eps = eps.meta['val'].constant.item()
+                input_node = self._get_input_node(rms_norm_pattern[6])

-                with graph.inserting_before(output_node):
+                with graph.inserting_before(last_mul_node):
                     # args schema
                     # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
                     rms_node = graph.create_node(
                         "call_function",
-                        exir_ops.edge.aten.rms_norm.default,
+                        self.rms_norm_target,
                         (
                             input_node,
                             list(gamma_node.meta["val"].shape),
                             gamma_node,
                             eps,
                         ),
                     )
-                    users = output_node.users.copy()
+                    users = last_mul_node.users.copy()
                     for user in users:
-                        user.replace_input_with(output_node, rms_node)
+                        user.replace_input_with(last_mul_node, rms_node)
                     # copy metadata
-                    rms_node.meta = output_node.meta
+                    rms_node.meta = last_mul_node.meta

         graph.eliminate_dead_code()
         graph_module.recompile()
         return PassResult(graph_module, True)
+
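
For intuition, the two patterns above start at the final elementwise multiply of a decomposed RMS norm and walk backwards through its argument chain. A rough sketch of the decomposed form the Qwen2-style pattern targets (illustrative code, not part of the commit):

    import torch

    def decomposed_rms_norm(x, gamma, eps=1e-6):
        # pow -> mean: mean of squares
        variance = x.pow(2).mean(-1, keepdim=True)
        # add -> rsqrt -> mul: normalize the input
        x = x * torch.rsqrt(variance + eps)
        # final mul by the gamma weight: the node the pattern starts from
        return gamma * x

Matching follows node.args from that last mul, so rms_norm_pattern[0] is the output mul, rms_norm_pattern[4] is the add that carries eps, and rms_norm_pattern[6] is the pow (or mul) whose input feeds the recomposed rms_norm node; the '*' wildcard absorbs optional dtype casts between the two muls.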

backends/qualcomm/_passes/utils.py

Lines changed: 66 additions & 1 deletion
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Dict
+from typing import Callable, Dict, List

 import torch
 from executorch.backends.qualcomm.builders.utils import get_parameter
@@ -121,3 +121,68 @@ def is_float_tensor(node: torch.fx.Node) -> bool:
     if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor):
         return False
     return node.meta["val"].dtype == torch.float32
+
+def _find_pattern(node: torch.fx.Node, pattern: List[Callable[[torch.fx.Node], bool] | str], from_args: bool=True, max_wildcard_life: int=3, verbose: bool=False):
+    '''Implement wildcard pattern matching
+    - node: fx.Node
+    - pattern: predicate list, can contain followings
+        Callable(fx.node): predicate
+        '*': wildcard
+    - from_args: if True find from node.args, otherwise from node.users
+    - max_wildcard_life: max number of skips for wildcard
+
+    If not matched, return None.
+    Otherwise, return list of matched node list, which is the same length as pattern
+    '''
+    def _is_node(node): return isinstance(node, torch.fx.Node)
+    def _pred(node, pat): return isinstance(pat, Callable) and pat(node)
+    def _next(node):
+        if from_args:
+            yield from [i for i in node.args if _is_node(i)]
+        else:
+            yield from [i for i in node.users]
+
+    asterisk = '*'
+
+    def _probe(cur, hist, pat_idx, asterisk_life_count=max_wildcard_life, verbose=verbose):
+        if pat_idx == len(pattern):
+            assert len(hist) == len(pattern)
+            if list(hist) not in matched:
+                matched.append(list(hist))
+            return
+        if verbose:
+            print(f"cur:{cur}, idx:{pat_idx}, life={asterisk_life_count}, pattern:{pattern[pat_idx]} hist={hist}")
+        if _pred(cur, pattern[pat_idx]):
+            hist.append(cur)
+            for child in _next(cur):
+                _probe(child, hist, pat_idx+1)
+            hist.pop(-1)
+        elif pattern[pat_idx] == asterisk and asterisk_life_count>0:
+            # 3 cases: ignore/consume/keep asterisk
+            # 1, Ignore asterisk
+            hist.append(None)
+            _probe(cur, hist, pat_idx+1)
+            hist.pop(-1)
+
+            # 2. Consume asterisk
+            hist.append(None)
+            for child in _next(cur):
+                _probe(child, hist, pat_idx+1)
+            hist.pop(-1)
+
+            # 3. keep asterisk and skip to next node
+            for child in _next(cur):
+                _probe(child, hist, pat_idx, asterisk_life_count-1)

+    matched = []
+    _probe(node, [], 0)
+    return matched if matched else None
+
+
+def find_patterns(node, patterns, **kwargs):
+    assert isinstance(patterns, list) and isinstance(patterns[0], list)
+    results = []
+    for pattern in patterns:
+        result = _find_pattern(node, pattern, **kwargs)
+        results.append(result)
+    return results
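
A usage sketch of the new helpers (mirroring how RecomposeRmsNorm drives them; graph_module and the _is_* predicates are assumed to be defined as in that pass):

    patterns = [
        [_is_mul, '*', _is_mul, _is_rsqrt, _is_add, _is_mean, _is_pow],
    ]
    for node in graph_module.graph.nodes:
        # One result per pattern; each entry is None or a list of matches.
        results = find_patterns(node, patterns, from_args=True)
        hits = [r for r in results if r is not None]
        if hits:
            # First match of the first matching pattern: a list aligned with
            # the pattern, with None placeholders at wildcard positions.
            matched_nodes = hits[0][0]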

backends/qualcomm/builders/node_visitor.py

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@
 QNN_TENSOR_TYPE_MAP = {
     torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
+    # Note that there is no float64 tensor data type in Qnn.
+    torch.float64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
     torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8,
     torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16,
     torch.int32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,

backends/qualcomm/quantizer/annotators.py

Lines changed: 5 additions & 1 deletion
@@ -127,7 +127,6 @@ def annotate_single_in_share_out(
         _annotated=True,
     )

-
 def annotate_single_in(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]):
         return
@@ -163,6 +162,11 @@ def annotate_single_in_single_out(
     )


+@register_annotator([torch.ops.aten.to.dtype])
+def annotate_to_dtype(node: Node, quantization_config: QuantizationConfig) -> None:
+    annotate_single_in_single_out(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.atan.default])
 def annotate_atan(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

backends/qualcomm/scripts/build.sh

Lines changed: 3 additions & 0 deletions
@@ -85,6 +85,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI='arm64-v8a' \
@@ -105,6 +106,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DANDROID_PLATFORM=android-30 \
         -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
         -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
         -B$EXAMPLE_ROOT
@@ -119,6 +121,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DANDROID_ABI='arm64-v8a' \
         -DANDROID_PLATFORM=android-30 \
         -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
         -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
         -B$LLAMA_EXAMPLE_ROOT

examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 3 deletions
@@ -977,7 +977,6 @@ def _to_edge_and_lower_llama(  # noqa: C901
         dep_table = get_passes_dependency_for_capture_program()
         passes_job[AnnotateStack][QCOM_PASS_ACTIVATE_KEY] = True
         passes_job[ConvertBmmToMatmul][QCOM_PASS_ACTIVATE_KEY] = True
-        passes_job[RecomposeRmsNorm][QCOM_PASS_ACTIVATE_KEY] = True
         passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True
         passes_job[TagQuantIO][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][
             "get_quant_io_dtype_fn"
@@ -1410,14 +1409,14 @@ def _get_source_transforms(  # noqa
             transforms.append(get_model_with_r1_r2(optimized_rotation_path))
             transforms.append(replace_attention_to_attention_sha)
             transforms.append(replace_causal_mask)
-            transforms.append(replace_rms_norm_with_native_rms_norm)
+            # transforms.append(replace_rms_norm_with_native_rms_norm)
             # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             transforms.append(convert_linear_to_conv2d)
         else:
             transforms.append(replace_kv_cache_with_simple_kv_cache)
             transforms.append(replace_sdpa_with_flex_sdpa)
             transforms.append(replace_causal_mask)
-            transforms.append(replace_rms_norm_with_native_rms_norm)
+            # transforms.append(replace_rms_norm_with_native_rms_norm)
             if optimized_rotation_path:
                 transforms.append(fuse_layer_norms)
                 transforms.append(get_model_with_r1_r2(optimized_rotation_path))
