
Commit cefe583

add DLRMv2 BKC under examples (#3743)

* add a models folder under examples for BKCs
* add folder for deepseek, dlrm and llama
* cp dlrm from intel-innersource/frameworks.ai.models.intel-models@82a9314
* support fp8
* replace crossnet
* support tf32
* remove unused codes
* change link
* rm CONTAINEAR
* update README
* remove jit_trace; fix flake
* rename to fp8_data.json; rm unused dependency
* cp unpack from jit_trace
* fix flake
* Update README.md (add tmp branch)
* update BKC
* fix ao link
* update BKC; refine q/dq
* update run_model.sh for fp8 and tf32
* update .bom
* fix flake
* Update branch on README.md
* fix flake

---------

Co-authored-by: y <[email protected]>
1 parent a07c604 commit cefe583

19 files changed, +37288 -2 lines changed

.bom

Lines changed: 5 additions & 2 deletions
@@ -62,7 +62,7 @@ types-dataclasses
 neural-compressor
 oneTBB
 oneCCL
-onednn-graph
+onednn-graph
 oneDNN
 jemalloc
 click
@@ -118,7 +118,7 @@ pyyaml
 schema
 setuptools
 Triton
-urllib3
+urllib3
 libcurl
 intel-media-va-driver-non-free
 libmfx1
@@ -165,3 +165,6 @@ g++-12
 backoff
 torchaudio
 einops
+torchao
+torchrec
+torchmetrics
CMakeLists.txt

Lines changed: 9 additions & 0 deletions

cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(aoti_example)

find_package(Torch REQUIRED)

add_executable(aoti_example bench.cpp model.so)

target_link_libraries(aoti_example "${TORCH_LIBRARIES}")
set_property(TARGET aoti_example PROPERTY CXX_STANDARD 17)
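
The executable above links a `model.so` generated ahead of time with AOT Inductor. As a minimal sketch of producing such a library, assuming the `torch._export.aot_compile` API available in PyTorch 2.x and a stand-in `TinyModel` (the real BKC would export the DLRM model instead):

```python
import torch
from torch._export import aot_compile

# Stand-in module for illustration only; not part of the BKC.
class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

model = TinyModel().eval()
example_inputs = (torch.randn(8, 16),)

with torch.no_grad():
    # aot_compile returns the path of the generated shared library;
    # the "aot_inductor.output_path" option pins it to model.so so the
    # CMake target above can link it.
    so_path = aot_compile(
        model,
        example_inputs,
        options={"aot_inductor.output_path": "model.so"},
    )
print(so_path)
```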
examples/cpu/inference/python/models/dlrm/README.md

Lines changed: 129 additions & 0 deletions

# DLRM v2 Inference

Best known configurations (BKC) for DLRM v2 inference with PyTorch.

## Model Information

| **Use Case** | **Framework** | **Model Repo** | **Branch/Commit/Tag** | **Optional Patch** |
|:---:|:---:|:---:|:---:|:---:|
| Inference | PyTorch | https://github.com/facebookresearch/dlrm/tree/main/torchrec_dlrm | - | - |

# Pre-Requisite
## Bare Metal
### General setup

Install PyTorch, torchao, and jemalloc:
```
git clone https://github.com/yanbing-j/pytorch.git
cd pytorch
git checkout yanbing/tf32_dev_branch_for_test/
git submodule sync
git submodule update --init --recursive
conda install cmake ninja
pip install -r requirements.txt
pip install mkl-static mkl-include
python setup.py install
cd ..

git clone https://github.com/shiyang-weng/ao.git
cd ao
git checkout wengshiy/scaled_mm
git submodule sync
git submodule update --init --recursive
pip install -r requirements.txt
python setup.py install
cd ..

conda install jemalloc
```
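
As a quick sanity check, both custom builds should be importable once the installs above complete (a sketch; version strings will reflect the branches used):

```python
import torch
import torchao  # built from the shiyang-weng/ao branch above

# Both imports should succeed after `python setup.py install` finishes.
print("torch:", torch.__version__)
print("torchao:", getattr(torchao, "__version__", "unknown"))
```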

### Model Specific Setup

* Set jemalloc and tcmalloc preload for better performance.

jemalloc and tcmalloc should come from the [General setup](#general-setup) section.
```
export LD_PRELOAD="<path to the jemalloc directory>/lib/libjemalloc.so":"path_to/tcmalloc/lib/libtcmalloc.so":$LD_PRELOAD
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000"
```
* Set IOMP preload for better performance.
```
pip install packaging intel-openmp
export LD_PRELOAD=path/lib/libiomp5.so:$LD_PRELOAD
```

## Datasets
The dataset can be downloaded and preprocessed by following https://github.com/mlcommons/training/tree/master/recommendation_v2/torchrec_dlrm#create-the-synthetic-multi-hot-dataset.
We also provide a preprocessing script, `preprocess_raw_dataset.sh`, based on the instructions above.
After downloading the raw dataset files `day_*.gz` and unzipping them into `RAW_DIR`, run:
```bash
cd intel-extension-for-pytorch/examples/cpu/inference/python/models/dlrm/
export MODEL_DIR=$(pwd)
export RAW_DIR=<the unzipped raw dataset>
export TEMP_DIR=<where to put temporary files during preprocessing>
export PREPROCESSED_DIR=<where to put the one-hot dataset>
export MULTI_HOT_DIR=<where to put the multi-hot dataset>
bash preprocess_raw_dataset.sh
```

## Pre-Trained Checkpoint
You can download and unzip the checkpoint by following
https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorch#downloading-model-weights
72+
73+
## Inference
74+
1. `git clone https://github.com/intel/intel-extension-for-pytorch.git`
75+
2. `cd intel-extension-for-pytorch/examples/cpu/inference/python/models/dlrm/`
76+
3. Create virtual environment `venv` and activate it:
77+
```
78+
python3 -m venv venv
79+
. ./venv/bin/activate
80+
```
81+
4. Install general model requirements
82+
```
83+
./setup.sh
84+
```
85+
86+
5. Setup required environment paramaters
87+
88+
| **Parameter** | **export command** |
89+
|:---------------------------:|:------------------------------------------------------------------------------------:|
90+
| **TEST_MODE** (THROUGHPUT, ACCURACY) | `export TEST_MODE=THROUGHPUT` |
91+
| **DATASET_DIR** | `export DATASET_DIR=<multi-hot dataset dir>` |
92+
| **WEIGHT_DIR** (ONLY FOR ACCURACY) | `export WEIGHT_DIR=<offical released checkpoint>` |
93+
| **PRECISION** | `export PRECISION=int8 <specify the precision to run: int8, fp32, bf32, bf16 or tf32>` |
94+
| **OUTPUT_DIR** | `export OUTPUT_DIR=$PWD` |
95+
| **BATCH_SIZE** (optional) | `export BATCH_SIZE=<set a value for batch size, else it will run with default batch size>` |
96+
| **TORCH_INDUCTOR** (optional) | `export TORCH_INDUCTOR=<0 or 1>` |
97+
| **FP8** (optional) | `export FP8=<0 or 1>` |
98+
99+
6. Run `run_model.sh`
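
For illustration only, a hypothetical Python driver that sets the parameters above and launches the script (the BKC itself just uses `export` plus `bash run_model.sh`):

```python
import os
import subprocess

# Hypothetical values; DATASET_DIR must point at your multi-hot dataset.
env = dict(
    os.environ,
    TEST_MODE="THROUGHPUT",               # or ACCURACY (then also set WEIGHT_DIR)
    DATASET_DIR="/data/criteo_multi_hot",  # placeholder path
    PRECISION="int8",                      # int8, fp32, bf32, bf16 or tf32
    OUTPUT_DIR=os.getcwd(),
)
subprocess.run(["bash", "run_model.sh"], env=env, check=True)
```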

## Output

Single-tile output will typically look like:

```
2024-07-18 15:58:00,970 - dlrm_main.py - __main__ - INFO - EVAL_START, EPOCH_NUM: 0
2024-07-18 16:00:14,120 - dlrm_main.py - __main__ - INFO - AUROC over test set: [0.5129603203103565, 0.0, 0.0].
2024-07-18 16:00:14,121 - dlrm_main.py - __main__ - INFO - Number of test samples: 131072
2024-07-18 16:00:14,121 - dlrm_main.py - __main__ - INFO - Throughput: 103711.5248249468 fps
2024-07-18 16:00:14,121 - dlrm_main.py - __main__ - INFO - Final AUROC: [0.5129603203103565, 0.0, 0.0]
2024-07-18 16:00:17,133 - dlrm_main.py - __main__ - INFO - AUROC over test set: [0.5129603203103565, 0.0, 0.0].
2024-07-18 16:00:17,133 - dlrm_main.py - __main__ - INFO - Number of test samples: 131072
2024-07-18 16:00:17,133 - dlrm_main.py - __main__ - INFO - Throughput: 102890.12235101678 fps
2024-07-18 16:00:17,134 - dlrm_main.py - __main__ - INFO - Final AUROC: [0.5129603203103565, 0.0, 0.0]
```

Final results of the inference run can be found in the `results.yaml` file.
```
results:
- key: throughput
  value: 102890.122
  unit: fps
- key: latency
  value: N/A
  unit: s
- key: accuracy
  value: 0.513
  unit: ROC AUC
```
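
As a sketch, assuming PyYAML is available, the metrics can be read back programmatically:

```python
import yaml  # pip install pyyaml

# Layout follows the results.yaml example above.
with open("results.yaml") as f:
    results = yaml.safe_load(f)["results"]

for entry in results:
    print(f'{entry["key"]}: {entry["value"]} {entry["unit"]}')
```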

examples/cpu/inference/python/models/dlrm/__init__.py

Whitespace-only changes.

examples/cpu/inference/python/models/dlrm/data_process/__init__.py

Whitespace-only changes.
Lines changed: 173 additions & 0 deletions
#!/usr/bin/env python3
#
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
from typing import List

from torch.utils.data import DataLoader
from torchrec.datasets.criteo import (
    CAT_FEATURE_COUNT,
    DAYS,
    DEFAULT_CAT_NAMES,
    DEFAULT_INT_NAMES,
    InMemoryBinaryCriteoIterDataPipe,
)
from torchrec.datasets.random import RandomRecDataset

# OSS import
try:
    # pyre-ignore[21]
    # @manual=//ai_codesign/benchmarks/dlrm/torchrec_dlrm/data:multi_hot_criteo
    from data.multi_hot_criteo import MultiHotCriteoIterDataPipe

except ImportError:
    pass

# internal import
try:
    from .multi_hot_criteo import MultiHotCriteoIterDataPipe  # noqa F811
except ImportError:
    pass

STAGES = ["train", "val", "test"]


def _get_random_dataloader(
    args: argparse.Namespace,
    stage: str,
) -> DataLoader:
    attr = f"limit_{stage}_batches"
    num_batches = getattr(args, attr)
    if stage in ["val", "test"] and args.test_batch_size is not None:
        batch_size = args.test_batch_size
    else:
        batch_size = args.batch_size
    return DataLoader(
        RandomRecDataset(
            keys=DEFAULT_CAT_NAMES,
            batch_size=batch_size,
            hash_size=args.num_embeddings,
            hash_sizes=(
                args.num_embeddings_per_feature
                if hasattr(args, "num_embeddings_per_feature")
                else None
            ),
            manual_seed=args.seed if hasattr(args, "seed") else None,
            ids_per_feature=1,
            num_dense=len(DEFAULT_INT_NAMES),
            num_batches=num_batches,
        ),
        batch_size=None,
        batch_sampler=None,
        pin_memory=args.pin_memory,
        num_workers=0,
    )


def _get_in_memory_dataloader(
    args: argparse.Namespace,
    stage: str,
) -> DataLoader:
    if args.in_memory_binary_criteo_path is not None:
        dir_path = args.in_memory_binary_criteo_path
        sparse_part = "sparse.npy"
        datapipe = InMemoryBinaryCriteoIterDataPipe
    else:
        dir_path = args.synthetic_multi_hot_criteo_path
        sparse_part = "sparse_multi_hot.npz"
        datapipe = MultiHotCriteoIterDataPipe

    if stage == "train":
        stage_files: List[List[str]] = [
            [os.path.join(dir_path, f"day_{i}_dense.npy") for i in range(DAYS - 1)],
            [os.path.join(dir_path, f"day_{i}_{sparse_part}") for i in range(DAYS - 1)],
            [os.path.join(dir_path, f"day_{i}_labels.npy") for i in range(DAYS - 1)],
        ]
    elif stage in ["val", "test"]:
        stage_files: List[List[str]] = [
            [os.path.join(dir_path, f"day_{DAYS-1}_dense.npy")],
            [os.path.join(dir_path, f"day_{DAYS-1}_{sparse_part}")],
            [os.path.join(dir_path, f"day_{DAYS-1}_labels.npy")],
        ]
    if stage in ["val", "test"] and args.test_batch_size is not None:
        batch_size = args.test_batch_size
    else:
        batch_size = args.batch_size
    dataloader = DataLoader(
        datapipe(
            stage,
            *stage_files,  # pyre-ignore[6]
            batch_size=batch_size,
            rank=0,
            world_size=1,
            drop_last=args.drop_last_training_batch if stage == "train" else False,
            shuffle_batches=args.shuffle_batches,
            shuffle_training_set=args.shuffle_training_set,
            shuffle_training_set_random_seed=args.seed,
            mmap_mode=args.mmap_mode,
            hashes=(
                args.num_embeddings_per_feature
                if args.num_embeddings is None
                else ([args.num_embeddings] * CAT_FEATURE_COUNT)
            ),
        ),
        batch_size=None,
        pin_memory=args.pin_memory,
        collate_fn=lambda x: x,
    )
    return dataloader


def get_dataloader(args: argparse.Namespace, backend: str, stage: str) -> DataLoader:
    """
    Gets desired dataloader from dlrm_main command line options. Currently, this
    function is able to return either a DataLoader wrapped around a RandomRecDataset or
    a DataLoader wrapped around an InMemoryBinaryCriteoIterDataPipe.

    Args:
        args (argparse.Namespace): Command line options supplied to dlrm_main.py's main
            function.
        backend (str): "nccl" or "gloo".
        stage (str): "train", "val", or "test".

    Returns:
        dataloader (DataLoader): PyTorch dataloader for the specified options.

    """
    stage = stage.lower()
    if stage not in STAGES:
        raise ValueError(f"Supplied stage was {stage}. Must be one of {STAGES}.")

    args.pin_memory = (
        (backend == "nccl") if not hasattr(args, "pin_memory") else args.pin_memory
    )

    if (
        args.in_memory_binary_criteo_path is None
        and args.synthetic_multi_hot_criteo_path is None
    ):
        return _get_random_dataloader(args, stage)
    else:
        return _get_in_memory_dataloader(args, stage)
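
A minimal usage sketch (hypothetical values; the attribute names mirror the options the functions above read, and `get_dataloader` is assumed to be in scope, e.g. run from the same module):

```python
import argparse

# With both dataset paths set to None, get_dataloader falls back to
# RandomRecDataset, so this runs without the Criteo data present.
args = argparse.Namespace(
    batch_size=512,
    test_batch_size=None,
    limit_val_batches=10,
    num_embeddings=1000,
    num_embeddings_per_feature=None,
    seed=0,
    in_memory_binary_criteo_path=None,
    synthetic_multi_hot_criteo_path=None,
)

loader = get_dataloader(args, backend="gloo", stage="val")
batch = next(iter(loader))  # a torchrec Batch of dense, sparse and label tensors
```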
