pytorch
diff --git a/‎backends/qualcomm/_passes/build_quant_io.py
Lines changed: 5 additions & 0 deletions b/‎backends/qualcomm/_passes/build_quant_io.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/qualcomm/builders/__init__.py
Lines changed: 4 additions & 0 deletions b/‎backends/qualcomm/builders/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/qualcomm/builders/op_copy.py
Lines changed: 64 additions & 0 deletions b/‎backends/qualcomm/builders/op_copy.py
Lines changed: 64 additions & 0 deletions
diff --git a/‎backends/qualcomm/builders/op_slice_scatter.py
Lines changed: 121 additions & 0 deletions b/‎backends/qualcomm/builders/op_slice_scatter.py
Lines changed: 121 additions & 0 deletions
diff --git a/‎backends/qualcomm/partition/common_defs.py
Lines changed: 0 additions & 2 deletions b/‎backends/qualcomm/partition/common_defs.py
Lines changed: 0 additions & 2 deletions
diff --git a/‎backends/qualcomm/quantizer/annotators.py
Lines changed: 16 additions & 0 deletions b/‎backends/qualcomm/quantizer/annotators.py
Lines changed: 16 additions & 0 deletions
diff --git a/‎backends/qualcomm/scripts/build.sh
Lines changed: 14 additions & 0 deletions b/‎backends/qualcomm/scripts/build.sh
Lines changed: 14 additions & 0 deletions
diff --git a/‎backends/qualcomm/tests/models.py
Lines changed: 21 additions & 0 deletions b/‎backends/qualcomm/tests/models.py
Lines changed: 21 additions & 0 deletions
@@ -39,6 +39,11 @@ def _build(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
             if QCOM_QUANTIZED_IO in n.meta:
                 n.meta["val"] = n.meta["val"].to(dtype=n.meta[QCOM_QUANTIZED_IO])
 
+        spec = []
+        for user in list(call_delegate[0].users):
+            spec.append(self._make_spec(user.meta["val"]))
+        call_delegate[0].meta["spec"] = tuple(spec)
+
     def call(self, graph_module: torch.fx.GraphModule):
         self._build(graph_module)
         graph_module.graph.eliminate_dead_code()
 
@@ -21,6 +21,7 @@
     op_ceil,
     op_clamp,
     op_conv2d,
+    op_copy,
     op_cos,
     op_cum_sum,
     op_depth_to_space,
@@ -78,6 +79,7 @@
     op_sin,
     op_skip_ops,
     op_slice_copy,
+    op_slice_scatter,
     op_softmax,
     op_space_to_depth,
     op_split_with_sizes,
@@ -114,6 +116,7 @@
     op_ceil,
     op_clamp,
     op_conv2d,
+    op_copy,
     op_cos,
     op_cum_sum,
     op_depth_to_space,
@@ -171,6 +174,7 @@
     op_sin,
     op_skip_ops,
     op_slice_copy,
+    op_slice_scatter,
     op_softmax,
     op_space_to_depth,
     op_split_with_sizes,
 
@@ -0,0 +1,64 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpReshape, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class Copy(NodeVisitor):
+    target = ["aten.copy.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = self.get_node(node.args[1])
+        input_tensor = self.get_tensor(input_node, node)
+        copy_inp_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        copy_input_tensors = [copy_inp_tensor_wrapper]
+
+        if quant_attrs := input_node.meta.get(QCOM_QUANT_ATTRS):
+            quant_attrs = quant_attrs.copy()
+            # Because there is no output after convert_pt2e, the QCOM_QUANT_ATTRS of node is none
+            node.meta[QCOM_QUANT_ATTRS] = quant_attrs
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        copy_output_tensors = [output_tensor_wrapper]
+
+        copy_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpReshape.op_name,
+        )
+        copy_op.AddInputTensors(copy_input_tensors)
+        copy_op.AddOutputTensors(copy_output_tensors)
+
+        return copy_op
@@ -0,0 +1,121 @@
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+import torch
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import (
+    OpScatterNd,
+    QNN_OP_PACKAGE_NAME_QTI_AISW,
+)
+
+
+@register_node_visitor
+class SliceScatterVisitor(NodeVisitor):
+    target = ["aten.slice_scatter.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        value_node = self.get_node(node.args[1])
+        value_tensor = self.get_tensor(value_node, node)
+        value_tensor_wrapper = self.define_tensor(
+            value_node,
+            node,
+            value_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        dim = cast(int, node.args[2])
+        if dim < 0:
+            dim = dim % len(input_tensor.shape)
+
+        start = 0 if node.args[3] is None else cast(int, node.args[3])
+        if start < 0:
+            start = start % input_tensor.shape[dim]
+
+        if len(node.args) > 4:
+            end = min(cast(int, node.args[4]), input_tensor.shape[dim])
+            if end < 0:
+                end = end % input_tensor.shape[dim]
+        else:
+            end = input_tensor.shape[dim]
+        
+        step = node.args[5] if len(node.args) > 5 else 1
+
+        target_index_shape = []
+        ranges = []
+        # Collect the index
+        for i in range(dim+1):
+            if i == dim:
+                target_range = torch.tensor(range(start, end, step), dtype=torch.int32)
+                target_index_shape.append(target_range.size(-1))
+                ranges.append(target_range)
+                break
+            else:
+                size = input_tensor.size(i)
+                target_index_shape.append(size)
+                ranges.append(torch.arange(size, dtype=torch.int32))
+        # last dim means x-tuple index
+        target_index_shape.append(dim+1)
+        target_index_tensor = torch.cartesian_prod(*ranges).reshape(target_index_shape).contiguous()
+
+        
+        target_index_node = torch.fx.Node(
+            node.graph,
+            node.name + "_target_index",
+            "call_function",
+            exir_ops.edge.aten.tensor.default,
+            (),  # args
+            {},  # kwargs
+        )
+        target_index_tensor_wrapper = self.define_tensor(
+            target_index_node,
+            node,
+            target_index_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+        )
+
+        index_put_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpScatterNd.op_name,
+        )
+        index_put_op.AddInputTensors(
+            [
+                input_tensor_wrapper,
+                target_index_tensor_wrapper,
+                value_tensor_wrapper,
+            ]
+        )
+        index_put_op.AddOutputTensors([output_tensor_wrapper])
+
+        return index_put_op
@@ -11,8 +11,6 @@
 
 not_supported_operator = [
     exir_ops.edge.aten.clone.default,
-    exir_ops.edge.aten.slice_scatter.default,
-    exir_ops.edge.aten.copy.default,
     exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,
 ]
 
 
@@ -643,6 +643,21 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
 
+@register_annotator([torch.ops.aten.slice_scatter.default])
+def annotate_slice_scatter(node: Node, quantization_config: QuantizationConfig) -> None:
+    input = node.args[0]
+    value = node.args[1]
+
+    input_qspec_map = {}
+    input_qspec_map[input] = quantization_config.input_activation
+    input_qspec_map[value] = SharedQuantizationSpec((input, node))
+
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=SharedQuantizationSpec((input, node)),
+        _annotated=True,
+    )
+
 
 @register_annotator([torch.ops.aten.sqrt.default])
 def annotate_sqrt(node: Node, quantization_config: QuantizationConfig) -> None:
@@ -1028,6 +1043,7 @@ def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None:
         torch.ops.aten.conv1d.default,
         torch.ops.aten.conv_transpose2d.input,
         torch.ops.aten.conv_transpose1d.default,
+        torch.ops.aten.convolution.default,
     ]
 )
 def annotate_conv(node: Node, quantization_config: QuantizationConfig) -> None:
 
@@ -110,6 +110,20 @@ if [ "$BUILD_AARCH64" = true ]; then
         -B$EXAMPLE_ROOT
 
     cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER
+
+    LLAMA_EXAMPLE_ROOT=examples/models/llama
+    cmake $PRJ_ROOT/$LLAMA_EXAMPLE_ROOT \
+        -DBUILD_TESTING=OFF \
+        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
+        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+        -DANDROID_ABI='arm64-v8a' \
+        -DANDROID_PLATFORM=android-30 \
+        -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+        -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
+        -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+        -B$LLAMA_EXAMPLE_ROOT
+
+    cmake --build $LLAMA_EXAMPLE_ROOT -j$BUILD_JOB_NUMBER
 fi
 
 if [ "$BUILD_X86_64" = true ]; then
 
@@ -228,6 +228,17 @@ def __init__(self):
     def forward(self, x, y):
         return torch.cat((y, y, x, x), axis=2)
 
+class CausalMask(torch.nn.Module):
+    """Test module that updates a registered mask buffer in place.
+
+    Holds a zero-initialised ``causal_mask`` buffer of shape (1, 1, 1, 128)
+    and a ``mask_length`` of 128.
+    """
+    # NOTE(review): presumably exercises the in-place slice-assignment path
+    # (slice_scatter + copy) enabled by this change -- confirm.
+    def __init__(self):
+        super().__init__()
+        self.register_buffer("causal_mask", torch.zeros((1,1, 1, 128)))
+        self.mask_length = 128
+
+    def forward(self, padding_mask):
+        """Fill the masked window of the buffer and return ``buffer + 1``.
+
+        The first ``mask_length`` entries along the last dimension are
+        replaced by their ``masked_fill(padding_mask, 1)`` result, written
+        back into the buffer itself.
+        """
+        self.causal_mask[:, :, :, :self.mask_length] = self.causal_mask[:, :, :, :self.mask_length].masked_fill(
+                    padding_mask, 1
+                )
+        return self.causal_mask+1
 
 class CDist(torch.nn.Module):
     def __init__(self):
@@ -1592,6 +1603,16 @@ def forward(self, x, y):
             + self.position_ids[:, : seq_length : self.step]
         )
 
+class SliceScatter(torch.nn.Module):
+    """Test module wrapping ``Tensor.slice_scatter`` with fixed slice parameters."""
+    def __init__(self, dim, start, end, step):
+        # dim/start/end/step are stored as given and forwarded unchanged
+        # to slice_scatter in forward().
+        super().__init__()
+        self.dim = dim
+        self.start = start
+        self.end = end
+        self.step = step
+
+    def forward(self, x, y):
+        """Return ``x`` with ``y`` scattered into the configured slice."""
+        return x.slice_scatter(y, dim=self.dim, start=self.start, end=self.end, step=self.step)
 
 class Softmax(torch.nn.Module):
     def __init__(self, dim):
Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,6 @@`
`11`	`11`
`12`	`12`	`not_supported_operator = [`
`13`	`13`	`exir_ops.edge.aten.clone.default,`
`14`		`- exir_ops.edge.aten.slice_scatter.default,`
`15`		`- exir_ops.edge.aten.copy.default,`
`16`	`14`	`exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,`
`17`	`15`	`]`
`18`	`16`