diff --git a/Libraries/oneCCL/README.md b/Libraries/oneCCL/README.md
new file mode 100644
index 0000000000..7bf8446d9a
--- /dev/null
+++ b/Libraries/oneCCL/README.md
@@ -0,0 +1,19 @@
+# Intel oneAPI Collective Communications Library (oneCCL)
+
+Collective Communication Library is a library providing an efficient implementation of communication patterns used in deep learning.
+
+Github : https://github.com/oneapi-src/oneCCL
+
+## License  
+The code samples are licensed under MIT license
+
+# oneCCL samples
+
+| Type      | Name                 | Description                                                  |
+| --------- | ----------------------- | ------------------------------------------------------------ |
+| Component | [oneCCL_Getting_Started](oneCCL_Getting_Started)     | These C++ & C API examples demonstrate the basics of the oneCCL programming model by invoking different collective operations such as allreduce. |
+| Component | [tutorials](tutorials)     | Hands-on Jupyter notebook tutorials covering different topics. |
+>  Notice : Please use Intel oneAPI DevCloud as the environment for jupyter notebook samples. \
+Users can refer to [DevCloud Getting Started](https://devcloud.intel.com/oneapi/get-started/) for using DevCloud \
+Users can use JupyterLab from DevCloud via "One-click Login in", and download samples via "git clone" or the "oneapi-cli" tool \
+Once users are in the JupyterLab with downloaded jupyter notebook samples, they can start following the steps without further installation needed.
diff --git a/Libraries/oneCCL/oneCCL_Getting_Started/CMakeLists.txt b/Libraries/oneCCL/oneCCL_Getting_Started/CMakeLists.txt
new file mode 100644
index 0000000000..7b151c24a8
--- /dev/null
+++ b/Libraries/oneCCL/oneCCL_Getting_Started/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 2.8.11)
+# Wrapper project for the oneCCL examples.
+#
+# Environment variables read at configure time:
+#   EXAMPLE_ROOT       - optional; when set, that directory is built instead of
+#                        the examples taken from CCL_ROOT.
+#   CCL_ROOT           - root of the oneCCL installation; the default examples
+#                        are copied from ${CCL_ROOT}/examples.
+#   CCL_CONFIGURATION  - when it matches "cpu_gpu_dpcpp" the SYCL examples are
+#                        copied as well.
+
+# Fall back to clang/dpcpp when no C++ compiler was given on the command line.
+# This has to precede project(), which is where the compilers get resolved.
+if("${CMAKE_CXX_COMPILER}" STREQUAL "")
+       set(CMAKE_C_COMPILER "clang")
+       set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+project (oneCCL_Getting_Started)
+if("$ENV{EXAMPLE_ROOT}" STREQUAL "")
+	message(" - use default examples")
+	# Without CCL_ROOT the copies below would point at "/examples/..." and fail
+	# with a misleading error, so stop early with an actionable message.
+	if("$ENV{CCL_ROOT}" STREQUAL "")
+		message(FATAL_ERROR "CCL_ROOT is not set; set up the oneCCL environment (e.g. source setvars.sh) before running cmake.")
+	endif()
+	# Quoted on purpose: an unset CCL_CONFIGURATION must compare as an empty
+	# string instead of vanishing from the if() argument list.
+	if("$ENV{CCL_CONFIGURATION}" MATCHES "cpu_gpu_dpcpp")
+		file(COPY "$ENV{CCL_ROOT}/examples/sycl" DESTINATION src)
+	endif()
+	# Relative DESTINATION paths resolve against the current build directory,
+	# so the examples end up in <build>/src.
+	file(COPY "$ENV{CCL_ROOT}/examples/cpu" DESTINATION src)
+	file(COPY "$ENV{CCL_ROOT}/examples/common" DESTINATION src)
+	file(COPY "$ENV{CCL_ROOT}/examples/benchmark" DESTINATION src)
+	file(COPY "$ENV{CCL_ROOT}/examples/include" DESTINATION src)
+	file(COPY "$ENV{CCL_ROOT}/examples/CMakeLists.txt" DESTINATION src)
+	# Build the copied examples; their binaries are placed under <build>/out.
+	add_subdirectory ("${PROJECT_BINARY_DIR}/src" out)
+else()
+	# User-supplied examples; binaries are placed under <build>/out.
+	add_subdirectory ("$ENV{EXAMPLE_ROOT}" out)
+endif()
diff --git a/Libraries/oneCCL/oneCCL_Getting_Started/License.txt b/Libraries/oneCCL/oneCCL_Getting_Started/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/Libraries/oneCCL/oneCCL_Getting_Started/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Libraries/oneCCL/oneCCL_Getting_Started/README.md b/Libraries/oneCCL/oneCCL_Getting_Started/README.md
new file mode 100644
index 0000000000..7002e25a18
--- /dev/null
+++ b/Libraries/oneCCL/oneCCL_Getting_Started/README.md
@@ -0,0 +1,178 @@
+# oneCCL Getting Started samples
+The CCL sample codes are implemented using C++, C and DPC++ language for CPU and GPU. 
+By using all reduce collective operation samples, users can understand how to compile oneCCL codes with various oneCCL configurations in Intel oneAPI environment.  
+
+| Optimized for                     | Description  
+|:---                               |:---
+| OS                                | Linux Ubuntu 18.04; 
+| Hardware                          | Kaby Lake with GEN9 or newer
+| Software                          | Intel oneAPI Collective Communications Library (oneCCL), Intel oneAPI DPC++/C++ Compiler, Intel oneAPI DPC++ Library (oneDPL), GNU Compiler
+| What you will learn               | basic oneCCL programming model for both Intel CPU and GPU
+| Time to complete                  | 15 minutes
+
+## List of Samples
+| C++ API | C API | Collective Operation |
+| ------ | ------ | ------ |
+| sycl_allreduce_cpp_test.cpp | sycl_allreduce_test.cpp |[Allreduce](https://intel.github.io/oneccl/spec/communication_primitives.html#allreduce) |
+| cpu_allreduce_cpp_test.cpp | cpu_allreduce_test.cpp/cpu_allreduce_bfp16.c |[Allreduce](https://intel.github.io/oneccl/spec/communication_primitives.html#allreduce) |
+>  Notice : Please use Intel oneAPI DevCloud as the environment for jupyter notebook samples. \
+Users can refer to [DevCloud Getting Started](https://devcloud.intel.com/oneapi/get-started/) for using DevCloud \
+Users can use JupyterLab from DevCloud via "One-click Login in", and download samples via "git clone" or the "oneapi-cli" tool \
+Once users are in the JupyterLab with downloaded jupyter notebook samples, they can start following the steps without further installation needed.
+
+## Purpose
+The samples implement the allreduce collective operation with oneCCL APIs. 
+With the samples users will learn how to compile the code with various oneCCL configurations in Intel oneAPI environment.
+
+## License  
+These code samples are licensed under the MIT license.
+
+## Prerequisites
+
+### CPU
+
+-----
+
+The samples below require the following components, which are part of the [Intel oneAPI DL Framework Developer Toolkit (DLFD Kit)
+](https://software.intel.com/en-us/oneapi/dldev-kit)
+*  Intel oneAPI Collective Communications Library (oneCCL)
+
+You can refer to this page [oneAPI](https://software.intel.com/en-us/oneapi) for toolkit installation.
+
+
+### GPU and CPU
+
+-----
+
+The samples below require the following components, which are part of the [Intel oneAPI Base Toolkit](https://software.intel.com/en-us/oneapi/oneapi-kit)
+*  Intel oneAPI Collective Communications Library (oneCCL)
+*  Intel oneAPI DPC++/C++ Compiler
+*  Intel oneAPI DPC++ Library (oneDPL)
+
+The samples also require an OpenCL driver. Please refer to [System Requirements](https://software.intel.com/en-us/articles/intel-oneapi-base-toolkit-system-requirements) for OpenCL driver installation.
+
+
+You can refer to this page [oneAPI](https://software.intel.com/en-us/oneapi) for toolkit installation.
+
+
+
+
+## Building the samples for CPU and GPU 
+
+### on a Linux* System  
+
+#### CPU only:
+
+- Build the samples with GCC for CPU only. \
+  Please replace ${ONEAPI_ROOT} with your installation path, \
+  e.g. /opt/intel/oneapi. \
+  There is no need to replace {DPCPP_CMPLR_ROOT}.
+  ```
+  source ${ONEAPI_ROOT}/setvars.sh --ccl-configuration=cpu_icc
+
+  cd oneapi-toolkit/oneCCL/oneCCL_Getting_Started   
+  mkdir build  
+  cd build 
+  cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++
+  make cpu_allreduce_cpp_test
+  ```
+> NOTE: The source file "cpu_allreduce_cpp_test.cpp" will be copied from ${INTEL_ONEAPI_INSTALL_FOLDER}/ccl/latest/examples/cpu to build/src/cpu folder.
+Users can rebuild the cpu_allreduce_cpp_test.cpp by typing "make cpu_allreduce_cpp_test" under build folder.
+
+#### GPU and CPU:
+
+- Build the samples with SYCL for GPU and CPU. \
+  Please replace ${ONEAPI_ROOT} with your installation path, \
+  e.g. /opt/intel/oneapi. \
+  There is no need to replace {DPCPP_CMPLR_ROOT}.
+  ```
+  source ${ONEAPI_ROOT}/setvars.sh --ccl-configuration=cpu_gpu_dpcpp
+
+  cd oneapi-toolkit/oneCCL/oneCCL_Getting_Started  
+  mkdir build  
+  cd build 
+  cmake ..  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp
+  make sycl_allreduce_cpp_test
+  ```
+> NOTE: The source file "sycl_allreduce_cpp_test.cpp" will be copied from ${INTEL_ONEAPI_INSTALL_FOLDER}/ccl/latest/examples/sycl to build/src/sycl folder.
+Users can rebuild the sycl_allreduce_cpp_test.cpp by typing "make sycl_allreduce_cpp_test" under build folder.
+
+### Include Files
+The include folder is located at ${CCL_ROOT}/include on your development system.
+
+## Running the Sample  
+
+### on a Linux* System  
+
+#### CPU only:
+- Run the program \
+  take cpu_allreduce_cpp_test for example. \
+  you can apply those steps for all other sample binaries. \
+  please replace the {NUMBER_OF_PROCESSES} with integer number accordingly
+
+  ```
+  mpirun -n ${NUMBER_OF_PROCESSES} ./out/cpu/cpu_allreduce_cpp_test 
+  ```
+  
+  ex: 
+  ```
+  mpirun -n 2 ./out/cpu/cpu_allreduce_cpp_test
+  ``` 
+  
+
+#### GPU and CPU:
+- Run the program \
+  take sycl_allreduce_cpp_test for example. \
+  you can apply those steps for all other sample binaries. \
+  please replace the {NUMBER_OF_PROCESSES} with integer number accordingly
+
+  ```
+  mpirun -n ${NUMBER_OF_PROCESSES} ./out/sycl/sycl_allreduce_cpp_test gpu|cpu|host|default
+  ```
+  
+  ex: run on GPU
+  ```
+  mpirun -n 2 ./out/sycl/sycl_allreduce_cpp_test gpu
+  ``` 
+  
+
+### Example of Output
+
+#### on Linux 
+- Run the program on CPU or GPU following [How to Run Section](#running-the-sample)
+- CPU Results
+
+  ```
+  Provided device type: cpu
+  Running on Intel(R) Core(TM) i7-7567U CPU @ 3.50GHz
+  Example passes
+  ```
+  please note that name of running device may vary according to your environment
+  
+
+- GPU Results
+  ```
+  Provided device type: gpu
+  Running on Intel(R) Gen9 HD Graphics NEO
+  Example passes
+  ```
+  please note that name of running device may vary according to your environment
+  
+- Enable oneCCL Verbose log 
+
+  There are different log levels in oneCCL. Users can refer to below table for different log levels. 
+  
+  | CCL_LOG_LEVEL | value 
+  | :------ | :------ 
+  | ERROR | 0   
+  | INFO | 1    
+  | DEBUG | 2   
+  | TRACE | 3    
+  
+  
+  Users can enable oneCCL verbose log by following below command to see more 
+  runtime information from oneCCL.
+  ```
+  export CCL_LOG_LEVEL=1
+  ```
+
diff --git a/Libraries/oneCCL/oneCCL_Getting_Started/sample.json b/Libraries/oneCCL/oneCCL_Getting_Started/sample.json
new file mode 100644
index 0000000000..3feba3c102
--- /dev/null
+++ b/Libraries/oneCCL/oneCCL_Getting_Started/sample.json
@@ -0,0 +1,26 @@
+{
+ "guid": "C56209D9-5CF1-4EEC-AE95-596D81640AEB",
+ "name": "oneCCL Getting Started",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneCCL"],
+ "description": "Basic oneCCL programming model for both Intel CPU and GPU.",
+ "toolchain": ["dpcpp"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["ccl"],
+ "os": ["linux"],
+ "builder": ["cli","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [{
+		"env": ["source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu_gpu_dpcpp --force" ],
+		"id": "gsg",
+		"steps": [
+			"mkdir build",
+      		        "cd build",
+           		"cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp",
+           		"make",
+			"mpirun -n 2 ./out/sycl/sycl_allreduce_cpp_test cpu",
+			"mpirun -n 2 ./out/sycl/sycl_allreduce_cpp_test gpu"
+		 ]
+	}]
+ }
+}
diff --git a/Libraries/oneCCL/tutorials/README.md b/Libraries/oneCCL/tutorials/README.md
new file mode 100644
index 0000000000..79f7dcedac
--- /dev/null
+++ b/Libraries/oneCCL/tutorials/README.md
@@ -0,0 +1,18 @@
+# Intel oneAPI Collective Communications Library (oneCCL)
+
+Collective Communication Library is a library providing an efficient implementation of communication patterns used in deep learning.
+
+Github : https://github.com/oneapi-src/oneCCL
+
+## License  
+The code samples are licensed under MIT license
+
+# oneCCL samples
+
+| Type      | Name                 | Description                                                  |
+| --------- | ----------------------- | ------------------------------------------------------------ |
+| Component | [oneCCL_Getting_Started](oneCCL_Getting_Started.ipynb) |This Jupyter Notebook demonstrates how to compile a oneCCL sample with different releases and how to port a oneCCL sample from CPU-only version to CPU&GPU version by using DPC++ via batch jobs on the Intel oneAPI DevCloud (check below Notice)|
+>  Notice : Please use Intel oneAPI DevCloud as the environment for jupyter notebook samples. \
+Users can refer to [DevCloud Getting Started](https://devcloud.intel.com/oneapi/get-started/) for using DevCloud \
+Users can use JupyterLab from DevCloud via "One-click Login in", and download samples via "git clone" or the "oneapi-cli" tool \
+Once users are in the JupyterLab with downloaded jupyter notebook samples, they can start following the steps without further installation needed.
diff --git a/Libraries/oneCCL/tutorials/oneCCL_Getting_Started.ipynb b/Libraries/oneCCL/tutorials/oneCCL_Getting_Started.ipynb
new file mode 100644
index 0000000000..e9f18415c6
--- /dev/null
+++ b/Libraries/oneCCL/tutorials/oneCCL_Getting_Started.ipynb
@@ -0,0 +1,1024 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1.1 - port an Intel® oneAPI Collective Communications Library (oneCCL) sample from CPU to GPU   -  CCL Allreduce  \n",
+    "\n",
+    "## Learning Objectives\n",
+    "In this module, the developer will:\n",
+    "* Learn different oneCCL configurations inside the Intel® oneAPI toolkit\n",
+    "* Learn how to compile a oneCCL sample with different configurations via batch jobs on the Intel® DevCloud for oneAPI or in local environments\n",
+    "* Learn how to program oneCCL with a simple sample\n",
+    "* Learn how to port a oneCCL sample from CPU-only version to CPU&GPU version by using DPC++\n",
+    "* Learn how to collect VTune™ Amplifier data for CPU and GPU runs\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# CCL Allreduce CPU to GPU porting Exercise\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1 : introduce oneCCL configurations inside oneAPI toolkits\n",
+    "oneCCL has two different configurations inside the oneAPI toolkits. Both lib and include folders under the oneCCL installation path contain two different configurations, and each configuration supports a different compiler."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the installation path of your oneAPI toolkit:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ONEAPI_INSTALL=/opt/intel/inteloneapi/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!printf '%s\\n'    $ONEAPI_INSTALL/ccl/latest/lib/*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see, there are two different folders under the oneCCL installation path, and each of those configurations supports different features. \n",
+    "This tutorial will guide you on how to compile and run against different oneCCL configurations."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, create a lab folder for this exercise:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir lab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Step 2 : Editing the cpu_allreduce_cpp_test.cpp code which only supports CPU\n",
+    "\n",
+    "This C++ API example demonstrates how to build a global reduction operation by using the sum function, and it can run only on CPU.\n",
+    "You can find a detailed allreduce API explanation at this [link](https://intel.github.io/oneccl/spec/communication_primitives.html#allreduce)\n",
+    "\n",
+    "\n",
+    "The Jupyter cell below with the gray background can be edited in-place and saved.\n",
+    "The first line of the cell contains the command **%%writefile lab/cpu_allreduce_cpp_test.cpp**. This tells the input cell to save the contents of the cell into the file 'lab/cpu_allreduce_cpp_test.cpp'. As you edit the cell and run it, it will save your changes into that file.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile lab/cpu_allreduce_cpp_test.cpp\n",
+    "#include <iostream>\n",
+    "#include <stdio.h>\n",
+    "#include \"ccl.hpp\"\n",
+    "\n",
+    "#define COUNT 128\n",
+    "\n",
+    "using namespace std;\n",
+    "\n",
+    "int main(int argc, char** argv)\n",
+    "{\n",
+    "    int i = 0;\n",
+    "    int size = 0;\n",
+    "    int rank = 0;\n",
+    "\n",
+    "    auto sendbuf = new int[COUNT];\n",
+    "    auto recvbuf = new int[COUNT];\n",
+    "\n",
+    "    auto comm = ccl::environment::instance().create_communicator();\n",
+    "    auto stream = ccl::environment::instance().create_stream();\n",
+    "\n",
+    "    rank = comm->rank();\n",
+    "    size = comm->size();\n",
+    "\n",
+    "    /* initialize sendbuf */\n",
+    "    for (i = 0; i < COUNT; i++) {\n",
+    "        sendbuf[i] = rank;\n",
+    "    }\n",
+    "\n",
+    "    /* modify sendbuf */\n",
+    "    for (i = 0; i < COUNT; i++) {\n",
+    "        sendbuf[i] += 1;\n",
+    "    }\n",
+    "\n",
+    "    /* invoke ccl_allreduce */\n",
+    "    comm->allreduce(sendbuf,\n",
+    "                   recvbuf,\n",
+    "                   COUNT,\n",
+    "                   ccl::reduction::sum,\n",
+    "                   nullptr, /* attr */\n",
+    "                   stream)->wait();\n",
+    "\n",
+    "    /* check correctness of recvbuf */\n",
+    "    for (i = 0; i < COUNT; i++) {\n",
+    "        if (recvbuf[i] != size * (size + 1) / 2) {\n",
+    "           recvbuf[i] = -1;\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    /* print out the result of the test */\n",
+    "    if (rank == 0) {\n",
+    "        for (i = 0; i < COUNT; i++) {\n",
+    "            if (recvbuf[i] == -1) {\n",
+    "                cout << \"FAILED\" << endl;\n",
+    "                break;\n",
+    "            }\n",
+    "        }\n",
+    "        if (i == COUNT) {\n",
+    "            cout << \"PASSED\" << endl;\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    return 0;\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, copy the required CMake file into the lab folder. The top half of CMakeLists.txt handles CPU-only samples, and the bottom half handles DPC++ samples with CPU and GPU support."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile lab/CMakeLists.txt\n",
+    "#cmake_minimum_required (VERSION 2.8)\n",
+    "#project(CCL_SAMPLES)\n",
+    "set(CCL_TEST_INCLUDE_DIR \"$ENV{PWD}/../include\")\n",
+    "set(CMAKE_INSTALL_PREFIX \"$ENV{PWD}/_install\")\n",
+    "if(${CMAKE_CXX_COMPILER_ID} STREQUAL \"GNU\")\n",
+    "    file(GLOB sources \"cpu_*.c\" \"cpu_*.cpp\")\n",
+    "    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_FLAGS} -std=c++11\")\n",
+    "    set(CCL_INCLUDE_DIR \"$ENV{CCL_ROOT}/include/cpu_icc\")\n",
+    "    set(CCL_LIB_DIR \"$ENV{CCL_ROOT}/lib/cpu_icc\")\n",
+    "    foreach(src ${sources})\n",
+    "        include_directories(${CCL_INCLUDE_DIR})\n",
+    "        include_directories(${CCL_TEST_INCLUDE_DIR})\n",
+    "        link_directories(${CCL_LIB_DIR})\n",
+    "        get_filename_component(executable ${src} NAME_WE)\n",
+    "        add_executable(${executable} ${src})\n",
+    "        target_link_libraries(${executable} PUBLIC rt)\n",
+    "        target_link_libraries(${executable} PUBLIC m)\n",
+    "        target_link_libraries(${executable} PRIVATE ccl)\n",
+    "        target_link_libraries(${executable} PUBLIC pthread dl stdc++)\n",
+    "        install(TARGETS ${executable} RUNTIME DESTINATION \"${CMAKE_INSTALL_PREFIX}\")\n",
+    "    endforeach()\n",
+    "endif()\n",
+    "\n",
+    "if(${CMAKE_CXX_COMPILER_ID} STREQUAL \"Clang\")\n",
+    "    set(CCL_INCLUDE_DIRS \"${CCL_INCLUDE_DIRS} $ENV{SYCL_BUNDLE_ROOT}/include\")\n",
+    "    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsycl -std=c++11\")\n",
+    "    file(GLOB sources \"sycl_*.c\" \"sycl_*.cpp\")\n",
+    "    set(CCL_INCLUDE_DIR \"$ENV{CCL_ROOT}/include/cpu_gpu_dpcpp\")\n",
+    "    set(CCL_LIB_DIR \"$ENV{CCL_ROOT}/lib/cpu_gpu_dpcpp\")\n",
+    "    foreach(src ${sources})\n",
+    "        include_directories(${CCL_INCLUDE_DIR})\n",
+    "        include_directories(${CCL_TEST_INCLUDE_DIR})\n",
+    "        link_directories(${CCL_LIB_DIR})\n",
+    "        get_filename_component(executable ${src} NAME_WE)\n",
+    "        add_executable(${executable} ${src})\n",
+    "        target_link_libraries(${executable} PUBLIC rt)\n",
+    "        target_link_libraries(${executable} PUBLIC m)\n",
+    "        target_link_libraries(${executable} PRIVATE ccl)\n",
+    "        target_link_libraries(${executable} PRIVATE OpenCL)\n",
+    "        target_link_libraries(${executable} PRIVATE sycl)\n",
+    "        install(TARGETS ${executable} RUNTIME DESTINATION \"${CMAKE_INSTALL_PREFIX}\")\n",
+    "    endforeach()\n",
+    "endif()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Build and Execution\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Build and Run with GNU Compiler and OpenMP\n",
+    "This sample performs a global reduction using the sum function and is built with the GNU compiler for the CPU. The following section guides you on how to build with g++ and run on a CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler  command and flags that will generate the executable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --ccl-configuration=cpu_icc --force > /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_gomp\n",
+    "cd cpu_gomp\n",
+    "cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++\n",
+    "make cpu_allreduce_cpp_test\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the Intel DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "the script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user must switch to the g++ oneCCL configuration by inputting a custom configuration \"--ccl-configuration=cpu_icc\" when running \"source setvars.sh\".\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --ccl-configuration=cpu_icc --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./cpu_gomp/out/cpu_allreduce_cpp_test\n",
+    "echo \"########## Done with the run\"\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "!rm -rf cpu_gomp; chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4: Analyze performance with VTune Amplifier\n",
+    "Use the VTune Amplifier command line to analyze performance and display the summary."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### do CPU profiling first\n",
+    "The script vtune_collect.sh encapsulates the profiling command and flags that will generate the VTune Amplifier profiling results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile vtune_collect.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --ccl-configuration=cpu_icc --force \n",
+    "type=hotspots\n",
+    "\n",
+    "rm -r $(pwd)/vtune_data\n",
+    "\n",
+    "echo \"VTune Collect $type\"\n",
+    "vtune -collect $type -result-dir $(pwd)/vtune_data $(pwd)/cpu_gomp/out/cpu_allreduce_cpp_test\n",
+    "\n",
+    "echo \"VTune Summary Report\"\n",
+    "vtune -report summary -result-dir $(pwd)/vtune_data -format html -report-output $(pwd)/summary.html\n",
+    "echo \"Done profiling\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run VTune Amplifier to Collect Hotspots and Generate Report\n",
+    "Collect VTune Amplifier data and generate report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! chmod 755 vtune_collect.sh; if [ -x \"$(command -v qsub)\" ]; then ./q vtune_collect.sh; else ./vtune_collect.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Display VTune Amplifier Summary\n",
+    "Display VTune Amplifier summary report generated in HTML format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import IFrame\n",
+    "IFrame(src='summary.html', width=960, height=600)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### do GPU profiling \n",
+    "The script vtune_collect.sh encapsulates the profiling command and flags that will generate the VTune Amplifier profiling results."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The profiling type is changed from hotspots to gpu-hotspots in below script to do basic GPU profiling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile vtune_collect.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --ccl-configuration=cpu_icc --force \n",
+    "type=gpu-hotspots\n",
+    "\n",
+    "rm -r $(pwd)/vtune_data\n",
+    "\n",
+    "echo \"VTune Collect $type\"\n",
+    "vtune -collect $type -result-dir $(pwd)/vtune_data $(pwd)/cpu_gomp/out/cpu_allreduce_cpp_test\n",
+    "\n",
+    "echo \"VTune Summary Report\"\n",
+    "vtune -report summary -result-dir $(pwd)/vtune_data -format html -report-output $(pwd)/summary-gpu.html\n",
+    "echo \"Done profiling\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run VTune Amplifier to Collect Hotspots and Generate Report\n",
+    "Collect VTune Amplifier data and generate report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! chmod 755 vtune_collect.sh; if [ -x \"$(command -v qsub)\" ]; then ./q vtune_collect.sh; else ./vtune_collect.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Display VTune Amplifier Summary\n",
+    "Display the VTune Amplifier summary report generated in HTML format"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the VTune Amplifier summary page, the GPU is stalled/idle all the time. This sample does not utilize GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from IPython.display import IFrame\n",
+    "IFrame(src='summary-gpu.html', width=960, height=600)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Step 5 : Modifying the cpu_allreduce_cpp_test.cpp code to support both CPU and GPU\n",
+    "\n",
+    "In this session, we will convert the above cpu_allreduce_cpp_test.cpp into sycl_allreduce_cpp_test.cpp, which supports both CPU and GPU, and compile the sample with DPC++ instead of g++.\n",
+    "\n",
+    "There are several steps to complete the code conversion from CPU to GPU for this sample.\n",
+    "\n",
+    "* Step 0 : Define inline functions to create sycl queue with the selected selector\n",
+    "* Step 1 : Declare the sycl queue and sycl buffers\n",
+    "* Step 2 : Use the inline functions in Step 0 to create the sycl queue\n",
+    "* Step 3 : Access sycl buffer via its accessor on both the host and target side \n",
+    "* Step 3.1 : Initialize sycl buffer and its accessor on the host side\n",
+    "* Step 3.2 : Modify sycl buffer via its accessor on the target device side \n",
+    "* Step 3.3 : Check sycl buffer's correctness on the target device side \n",
+    "* Step 3.4 : Check sycl buffer's correctness on the host side\n",
+    "\n",
+    "You can find related modifications below in sycl_allreduce_cpp_test.cpp, and the modifications for each step are wrapped up with \">>>>>>\" and \"<<<<<<\".\n",
+    "\n",
+    "**_NOTE:_** Host Accessors: The constructor for a host accessor waits for all kernels that modify the same buffer (or\n",
+    "image) in any queues to complete and then copies data back to host memory before the constructor returns.\n",
+    "Any command groups with requirements to the same memory object cannot execute until the host accessor\n",
+    "is destroyed. **Therefore, we must have { } for Step 3.1**\n",
+    "\n",
+    "There are two files in this DPC++ allreduce sample:\n",
+    "* sycl_base.hpp\n",
+    "* sycl_allreduce_cpp_test.cpp\n",
+    "\n",
+    "sycl_base.hpp contains inline functions to create sycl queue with the selected selector, and main program is in sycl_allreduce_cpp_test.cpp.\n",
+    "\n",
+    "The Jupyter cell below with the gray background can be edited in-place and saved.\n",
+    "The first line of the cell contains the command **%%writefile lab/sycl_base.hpp**. This tells the input cell to save the contents of the cell into the file 'lab/sycl_base.hpp'. As you edit the cell and run it, it will save your changes into that file.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### lab/sycl_base.hpp\n",
+    "header file for inline functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile lab/sycl_base.hpp\n",
+    "#include <iostream>\n",
+    "#include <stdio.h>\n",
+    "\n",
+    "// ------ GPU code conversion --Step 0 >>>>>>\n",
+    "// Define inline functions to create sycl queue with the selected selector\n",
+    "#include <CL/sycl.hpp>\n",
+    "#include \"ccl.hpp\"\n",
+    "\n",
+    "using namespace std;\n",
+    "\n",
+    "using namespace cl::sycl;\n",
+    "using namespace cl::sycl::access;\n",
+    "\n",
+    "// Returns true when cl::sycl::device::get_devices() reports at least one GPU device.\n",
+    "inline bool has_gpu()\n",
+    "{\n",
+    "    bool gpu_found = false;\n",
+    "    const std::vector<cl::sycl::device> all_devices = cl::sycl::device::get_devices();\n",
+    "    // Walk the device list and stop at the first GPU.\n",
+    "    for (auto it = all_devices.begin(); it != all_devices.end() && !gpu_found; ++it)\n",
+    "    {\n",
+    "        gpu_found = it->is_gpu();\n",
+    "    }\n",
+    "    return gpu_found;\n",
+    "}\n",
+    "\n",
+    "// Creates the SYCL queue for the device type named on the command line.\n",
+    "// argv[1] must be one of: cpu | gpu | host | default.\n",
+    "// Returns 0 on success; prints a usage hint to stderr and returns -1 when the\n",
+    "// argument count is not exactly one or the device type is not recognized.\n",
+    "inline int create_sycl_queue(int argc, char **argv, cl::sycl::queue &queue)\n",
+    "{\n",
+    "    const char *usage_hint = \"Please provide device type: cpu | gpu | host | default \";\n",
+    "\n",
+    "    // Exactly one argument (the device type) is expected.\n",
+    "    if (argc != 2) {\n",
+    "        cerr << usage_hint << std::endl;\n",
+    "        return -1;\n",
+    "    }\n",
+    "\n",
+    "    // Map the requested device type to a SYCL device selector.\n",
+    "    const char *device_type = argv[1];\n",
+    "    unique_ptr<cl::sycl::device_selector> chosen;\n",
+    "\n",
+    "    if (strcmp(device_type, \"cpu\") == 0) {\n",
+    "        chosen.reset(new cl::sycl::cpu_selector());\n",
+    "    } else if (strcmp(device_type, \"gpu\") == 0) {\n",
+    "        if (has_gpu()) {\n",
+    "            chosen.reset(new cl::sycl::gpu_selector());\n",
+    "        } else {\n",
+    "            // No GPU device was reported: use default_selector instead.\n",
+    "            chosen.reset(new cl::sycl::default_selector());\n",
+    "            cout << \"GPU is unavailable, default_selector has been created instead of gpu_selector.\" << std::endl;\n",
+    "        }\n",
+    "    } else if (strcmp(device_type, \"host\") == 0) {\n",
+    "        chosen.reset(new cl::sycl::host_selector());\n",
+    "    } else if (strcmp(device_type, \"default\") == 0) {\n",
+    "        // \"default\" is served by host_selector, as the message below states.\n",
+    "        chosen.reset(new cl::sycl::host_selector());\n",
+    "        cout << \"Accelerator is unavailable for multiprocessing, host_selector has been created instead of default_selector.\" << std::endl;\n",
+    "    } else {\n",
+    "        cerr << usage_hint << std::endl;\n",
+    "        return -1;\n",
+    "    }\n",
+    "\n",
+    "    // Bind the queue to the selected device and report which device was picked.\n",
+    "    queue = cl::sycl::queue(*chosen);\n",
+    "    cout << \"Provided device type \" << device_type << \"\\nRunning on \"\n",
+    "         << queue.get_device().get_info<cl::sycl::info::device::name>()\n",
+    "         << \"\\n\";\n",
+    "\n",
+    "    return 0;\n",
+    "}\n",
+    "                \n",
+    "//<<<<<< ------ GPU code conversion --Step 0     "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### lab/sycl_allreduce_cpp_test.cpp\n",
+    "Implementation of SYCL allreduce functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Jupyter cell below with the gray background can be edited in-place and saved.\n",
+    "The first line of the cell contains the command **%%writefile lab/sycl_allreduce_cpp_test.cpp**. This tells the input cell to save the contents of the cell into the file named 'lab/sycl_allreduce_cpp_test.cpp'. As you edit the cell and run it, it will save your changes into that file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile lab/sycl_allreduce_cpp_test.cpp\n",
+    "// ------ GPU code conversion --Step 0 >>>>>>\n",
+    "#include \"sycl_base.hpp\"\n",
+    "//<<<<<< ------ GPU code conversion --Step 0     \n",
+    "#define COUNT 128\n",
+    "// Allreduce test: every rank contributes rank+1 per element, so each summed element must equal size*(size+1)/2.\n",
+    "int main(int argc, char** argv)\n",
+    "{\n",
+    "    int i = 0;\n",
+    "    int size = 0;\n",
+    "    int rank = 0;\n",
+    "\n",
+    "    // ------ GPU code conversion --Step 1 >>>>>>\n",
+    "    // Declare the sycl queue and sycl buffers\n",
+    "    cl::sycl::queue q;\n",
+    "    cl::sycl::buffer<int, 1> sendbuf(COUNT);\n",
+    "    cl::sycl::buffer<int, 1> recvbuf(COUNT);\n",
+    "    //<<<<<< ------ GPU code conversion --Step 1    \n",
+    "    \n",
+    "    // ------ GPU code conversion --Step 2 >>>>>>\n",
+    "    // Use inline functions in Step 0 to create the sycl queue\n",
+    "    if (create_sycl_queue(argc, argv, q) != 0) {\n",
+    "        return -1;\n",
+    "    }\n",
+    "    //<<<<<< ------ GPU code conversion --Step 2\n",
+    "    \n",
+    "    auto comm = ccl::environment::instance().create_communicator();\n",
+    "    auto stream = ccl::environment::instance().create_stream();\n",
+    "\n",
+    "    rank = comm->rank();\n",
+    "    size = comm->size();\n",
+    "\n",
+    "    /* initialize sendbuf and recvbuf*/\n",
+    "    // ------ GPU code conversion --Step 3.1 >>>>>>\n",
+    "    {\n",
+    "        // open buffers on the host; the { } scope destroys the host accessors before any kernel is submitted\n",
+    "        auto host_acc_sbuf = sendbuf.get_access<mode::write>();\n",
+    "        auto host_acc_rbuf = recvbuf.get_access<mode::write>();\n",
+    "        for (i = 0; i < COUNT; i++) {\n",
+    "            host_acc_sbuf[i] = rank;\n",
+    "            host_acc_rbuf[i] = -1;\n",
+    "        }\n",
+    "    }\n",
+    "    //<<<<<< ------ GPU code conversion --Step 3.1\n",
+    "\n",
+    "    /* modify sendbuf */\n",
+    "    // ------ GPU code conversion --Step 3.2 >>>>>>\n",
+    "    // open sendbuf as read_write (+= reads the old value before storing) and modify it on the target device side \n",
+    "    q.submit([&](handler& cgh){\n",
+    "       auto dev_acc_sbuf = sendbuf.get_access<mode::read_write>(cgh);\n",
+    "       cgh.parallel_for<class allreduce_test_sbuf_modify>(range<1>{COUNT}, [=](item<1> id) {\n",
+    "           dev_acc_sbuf[id] += 1;\n",
+    "       });\n",
+    "    });\n",
+    "    //<<<<<< ------ GPU code conversion --Step 3.2\n",
+    "    \n",
+    "    /* invoke ccl_allreduce */\n",
+    "    comm->allreduce(sendbuf,\n",
+    "                   recvbuf,\n",
+    "                   COUNT,\n",
+    "                   ccl::reduction::sum,\n",
+    "                   nullptr, /* attr */\n",
+    "                   stream)->wait();\n",
+    "\n",
+    "    \n",
+    "    \n",
+    "    /* check correctness of recvbuf */\n",
+    "    // ------ GPU code conversion --Step 3.3 >>>>>>\n",
+    "    // open recvbuf as read_write (the kernel compares, then overwrites mismatches with -1) on the target device side \n",
+    "    q.submit([&](handler& cgh){\n",
+    "       auto dev_acc_rbuf = recvbuf.get_access<mode::read_write>(cgh);\n",
+    "       cgh.parallel_for<class allreduce_test_rbuf_check>(range<1>{COUNT}, [=](item<1> id) {\n",
+    "           if (dev_acc_rbuf[id] != size*(size+1)/2) {\n",
+    "               dev_acc_rbuf[id] = -1;\n",
+    "           }\n",
+    "       });\n",
+    "    });\n",
+    "    //<<<<<< ------ GPU code conversion --Step 3.3\n",
+    "    \n",
+    "    /* print out the result of the test */\n",
+    "    if (rank == 0) {\n",
+    "        // ------ GPU code conversion --Step 3.4 >>>>>>\n",
+    "        // open recvbuf on the host and look for the -1 markers written by the check kernel\n",
+    "        auto host_acc_rbuf_new = recvbuf.get_access<mode::read>();\n",
+    "        for (i = 0; i < COUNT; i++) {\n",
+    "            if (host_acc_rbuf_new[i] == -1) {\n",
+    "        //<<<<<< ------ GPU code conversion --Step 3.4\n",
+    "                cout << \"FAILED\" << std::endl;\n",
+    "                break;\n",
+    "            }\n",
+    "        }\n",
+    "        if (i == COUNT) {\n",
+    "            cout << \"PASSED\" << std::endl;\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    return 0;\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Build and Run with the DPC++ Compiler\n",
+    "For this global reduction operation sample on GPU and CPU, DPC++ is used as the compiler.\n",
+    "The following section guides you how to build with DPC++ and run on GPU and CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler  command and flags that will generate the executable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source \"$ONEAPI_INSTALL/setvars.sh\" --ccl-configuration=cpu_gpu_dpcpp --force > /dev/null 2>&1  # load the oneAPI environment quietly\n",
+    "export EXAMPLE_ROOT=./lab/  # exported for the cmake run below\n",
+    "mkdir -p dpcpp  # out-of-source build directory; -p keeps a re-run from failing\n",
+    "cd dpcpp || exit 1  # never run cmake/make from the wrong directory\n",
+    "cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp\n",
+    "make sycl_allreduce_cpp_test\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, execute your program on the DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --ccl-configuration=cpu_gpu_dpcpp --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./dpcpp/out/sycl_allreduce_cpp_test gpu\n",
+    "echo \"########## Done with the run\"\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm -rf dpcpp; chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6: Analyze performance with VTune Amplifier\n",
+    "Use the VTune Amplifier command line to analyze performance and display the summary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### do CPU profiling first. \n",
+    "The script vtune_collect.sh encapsulates the profiling command and flags that will generate the VTune Amplifier profiling results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile vtune_collect.sh\n",
+    "#!/bin/bash\n",
+    "source \"$ONEAPI_INSTALL/setvars.sh\" --ccl-configuration=cpu_gpu_dpcpp --force\n",
+    "type=hotspots\n",
+    "# Start from a fresh result directory; -f keeps the first run (no directory yet) from reporting an error.\n",
+    "rm -rf \"$(pwd)/vtune_data\"\n",
+    "\n",
+    "echo \"VTune Collect $type\"\n",
+    "vtune -collect \"$type\" -result-dir \"$(pwd)/vtune_data\" \"$(pwd)/dpcpp/out/sycl_allreduce_cpp_test\" cpu\n",
+    "\n",
+    "echo \"VTune Summary Report\"\n",
+    "vtune -report summary -result-dir \"$(pwd)/vtune_data\" -format html -report-output \"$(pwd)/summary.html\"\n",
+    "echo \"Done profiling\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run VTune Amplifier to Collect Hotspots and Generate Report\n",
+    "Collect VTune Amplifier data and generate report:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! chmod 755 vtune_collect.sh; if [ -x \"$(command -v qsub)\" ]; then ./q vtune_collect.sh; else ./vtune_collect.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Display VTune Amplifier Summary\n",
+    "Display the VTune Amplifier summary report generated in HTML format:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import IFrame\n",
+    "IFrame(src='summary.html', width=960, height=600)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### do GPU profiling \n",
+    "The script vtune_collect.sh encapsulates the profiling command and flags that will generate the VTune Amplifier profiling results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile vtune_collect.sh\n",
+    "#!/bin/bash\n",
+    "source \"$ONEAPI_INSTALL/setvars.sh\" --ccl-configuration=cpu_gpu_dpcpp --force\n",
+    "type=gpu-hotspots\n",
+    "# Start from a fresh result directory; -f keeps the first run (no directory yet) from reporting an error.\n",
+    "rm -rf \"$(pwd)/vtune_data\"\n",
+    "\n",
+    "echo \"VTune Collect $type\"\n",
+    "vtune -collect \"$type\" -result-dir \"$(pwd)/vtune_data\" \"$(pwd)/dpcpp/out/sycl_allreduce_cpp_test\" gpu\n",
+    "\n",
+    "\n",
+    "echo \"VTune Summary Report\"\n",
+    "vtune -report summary -result-dir \"$(pwd)/vtune_data\" -format html -report-output \"$(pwd)/summary-gpu.html\"\n",
+    "echo \"Done profiling\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run VTune Amplifier to Collect Hotspots and Generate Report\n",
+    "Collect VTune Amplifier data and generate report:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! chmod 755 vtune_collect.sh; if [ -x \"$(command -v qsub)\" ]; then ./q vtune_collect.sh; else ./vtune_collect.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Display VTune Amplifier Summary\n",
+    "Display the VTune Amplifier summary report generated in HTML format:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import IFrame\n",
+    "IFrame(src='summary-gpu.html', width=960, height=600)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are the supported profiling types from VTune Amplifier.\n",
+    "\n",
+    "* type=hotspots\n",
+    "* type=memory-consumption\n",
+    "* type=uarch-exploration\n",
+    "* type=memory-access\n",
+    "* type=threading\n",
+    "* type=hpc-performance\n",
+    "* type=system-overview\n",
+    "* type=graphics-rendering\n",
+    "* type=io\n",
+    "* type=fpga-interaction\n",
+    "* type=gpu-offload\n",
+    "* type=gpu-hotspots\n",
+    "* type=throttling\n",
+    "* type=platform-profiler\n",
+    "* type=cpugpu-concurrency\n",
+    "* type=tsx-exploration\n",
+    "* type=tsx-hotspots\n",
+    "* type=sgx-hotspots\n",
+    "\n",
+    "For details of VTune Amplifier usage, please refer to https://software.intel.com/en-us/oneapi/vtune-profiler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Summary\n",
+    "In this lab the developer will have learned the following:\n",
+    "* Know different oneCCL configurations inside oneAPI toolkit\n",
+    "* Know how to compile a oneCCL sample with different configurations via batch jobs on the Intel oneAPI DevCloud or in local environments\n",
+    "* Know how to program oneCCL with a simple sample\n",
+    "* Know how to port a oneCCL sample from CPU-only version to CPU&GPU version by using DPC++\n",
+    "* Know how to collect VTune Amplifier data for CPU and GPU runs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "525.6px",
+    "left": "28px",
+    "top": "137.8px",
+    "width": "301.109px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Libraries/oneCCL/tutorials/q b/Libraries/oneCCL/tutorials/q
new file mode 100755
index 0000000000..7905552a6f
--- /dev/null
+++ b/Libraries/oneCCL/tutorials/q
@@ -0,0 +1,32 @@
+#!/bin/bash
+#========================================
+# Script to submit job in Intel devcloud
+#
+# Version: 0.5
+#========================================
+if [ -z "$1" ]; then
+	echo "Missing script argument, Usage: ./q run.sh"
+elif [ ! -f "$1" ]; then
+    echo "File $1 does not exist"
+else
+	script=$1
+	rm *.sh.* > /dev/null 2>&1
+	#qsub
+	echo "Submitting job:"
+	qsub -l nodes=1:gpu:ppn=2 -d . $script
+	#qstat
+	qstat 
+	#wait for output file to be generated and display
+	echo -ne "Waiting for Output."
+	until [ -f $script.o* ]; do
+		sleep 1
+		echo -ne "."
+		((timeout++))
+		if [ $timeout == 60 ]; then
+			echo "TimeOut 60 seconds: Job is still queued for execution, check for output file later (*.sh.o)"
+			break
+		fi
+	done
+	cat $script.o*
+	cat $script.e*
+fi
diff --git a/Libraries/oneCCL/tutorials/sample.json b/Libraries/oneCCL/tutorials/sample.json
new file mode 100644
index 0000000000..23d4b492c6
--- /dev/null
+++ b/Libraries/oneCCL/tutorials/sample.json
@@ -0,0 +1,21 @@
+{
+ "guid": "70FAEF03-6509-4B6C-B995-D42A65CE36EE",
+ "name": "oneCCL Tutorials",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneCCL"],
+ "description": "oneCCL tutorials.",
+ "toolchain": ["dpcpp"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["ccl"],
+ "os": ["linux"],
+ "builder": ["cli","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [{
+		"env": ["source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu_gpu_dpcpp --force" ],
+		"id": "ccl gsg",
+		"steps": [
+			"runipy oneCCL_Getting_Started.ipynb"
+		 ]
+	}]
+ }
+}
diff --git a/Libraries/oneDNN/README.md b/Libraries/oneDNN/README.md
new file mode 100644
index 0000000000..825dd1e156
--- /dev/null
+++ b/Libraries/oneDNN/README.md
@@ -0,0 +1,32 @@
+# oneAPI Deep Neural Network Library (oneDNN)
+
+oneAPI Deep Neural Network Library (oneDNN) is an open-source performance
+library for deep learning applications. The library includes basic building
+blocks for neural networks optimized for Intel Architecture Processors
+and Intel Processor Graphics. oneDNN is intended for deep learning
+applications and framework developers interested in improving application
+performance on Intel CPUs and GPUs.
+
+You can find library source code and code used by these samples at [oneDNN Github repository](https://github.com/oneapi-src/oneDNN).
+
+## License
+The code samples are licensed under MIT license.
+
+# oneDNN Samples
+
+| Type      | Name                                             | Description
+| --------- | ------------------------------------------------ | -
+| Component | [getting_started](getting_started)               | A C++ sample demonstrating basics of oneDNN programming model.
+| Component | [dpcpp_interoperability](dpcpp_interoperability) | A DPC++ example demonstrating interoperability of oneDNN with DPC++ application code.
+| Component | [simple_model](simple_model)                     | A C++ example demonstrating implementation of simple convolutional model with oneDNN.
+| Component | [tutorials](tutorials)                           | Hands-on Jupyter notebook tutorials among different topics.
+
+# Using Samples in Intel oneAPI DevCloud
+
+You can use oneDNN samples in
+[Intel oneAPI DevCloud](https://devcloud.intel.com/oneapi/get-started/)
+environment in the following ways:
+* Login to a DevCloud system via SSH and
+  * use `git clone` to get a full copy of samples repository, or
+  * use `oneapi-cli` tool to download specific sample.
+* Launch a JupyterLab server and run Jupyter Notebooks from your web browser.
diff --git a/Libraries/oneDNN/dpcpp_interoperability/CMakeLists.txt b/Libraries/oneDNN/dpcpp_interoperability/CMakeLists.txt
new file mode 100644
index 0000000000..4327bfb627
--- /dev/null
+++ b/Libraries/oneDNN/dpcpp_interoperability/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.11)
+if("${CMAKE_CXX_COMPILER}" STREQUAL "")  # no C++ compiler chosen by the user: default to clang/dpcpp
+    set(CMAKE_C_COMPILER "clang")
+    set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+project (dpcpp_interoperability)
+file(COPY $ENV{DNNLROOT}/examples/sycl_interop.cpp DESTINATION src)  # stage the example source under the build tree
+file(COPY $ENV{DNNLROOT}/examples/CMakeLists.txt DESTINATION src)    # ...together with the build script that compiles it
+if(WIN32 AND "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC")  # "x" prefix: valid when the ID is empty, no implicit variable dereference
+    file(COPY $ENV{DNNLROOT}/examples/template.vcxproj.user DESTINATION src)
+endif()
+add_subdirectory (${PROJECT_BINARY_DIR}/src bin)  # build the staged example; its outputs go to the bin subdirectory
diff --git a/Libraries/oneDNN/dpcpp_interoperability/License.txt b/Libraries/oneDNN/dpcpp_interoperability/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/Libraries/oneDNN/dpcpp_interoperability/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Libraries/oneDNN/dpcpp_interoperability/README.md b/Libraries/oneDNN/dpcpp_interoperability/README.md
new file mode 100644
index 0000000000..18411eba91
--- /dev/null
+++ b/Libraries/oneDNN/dpcpp_interoperability/README.md
@@ -0,0 +1,128 @@
+# oneDNN DPC++ Interoperability Sample
+
+This C++ API example demonstrates programming for Intel(R) Processor Graphics with SYCL extensions API in oneDNN. 
+Users will know how to access SYCL buffer and queue via oneDNN SYCL interoperability interfaces,  
+and this interface also helps users to execute a custom SYCL kernel with oneDNN library.
+
+| Optimized for                      | Description
+| :---                               | :---
+| OS                                 | Linux Ubuntu 18.04;
+| Hardware                           | Kaby Lake with GEN9 or newer
+| Software                           | Intel oneAPI Deep Neural Network Library (oneDNN), Intel oneAPI DPC++/C++ Compiler, Intel oneAPI Threading Building Blocks (oneTBB)
+| What you will learn                | Using oneDNN in DPC++ application targeting Intel CPU or Intel GPU
+| Time to complete                   | 15 minutes
+
+## Purpose
+
+This sample demonstrates programming for Intel(R) Processor Graphics with SYCL extensions API in oneDNN.
+
+With this sample you will learn:
+* How to create a GPU or CPU engine.
+* How to create a memory descriptor/object.
+* How to create a SYCL kernel for data initialization.
+* How to access a SYCL buffer via SYCL interoperability interface.
+* How to access a SYCL queue via SYCL interoperability interface.
+* How to execute a SYCL kernel with related SYCL queue and SYCL buffer
+* How to create operation descriptor/operation primitives descriptor/primitive.
+* How to execute the primitive with the initialized memory.
+* How to validate the result through a host accessor.
+
+The sample executes on system's CPU by default and can be executed on Intel GPU
+using a command line parameter `gpu`.
+
+## Key Implementation Details
+
+This sample uses example file `${DNNLROOT}/examples/sycl_interop.cpp`
+from oneDNN distribution. You can find this code in
+[oneDNN Github repository](https://github.com/oneapi-src/oneDNN/blob/dev-v2/examples/sycl_interop.cpp).
+
+Detailed code walkthrough is available in [oneDNN developer guide](https://oneapi-src.github.io/oneDNN/v2/sycl_interop_cpp.html)
+
+## License
+
+This code sample is licensed under MIT license.
+
+## Building the sample for CPU and GPU
+
+### On a Linux System
+
+Perform the following steps:
+1. Setup oneAPI development environment
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+3. Run the program
+```
+./bin/sycl-interop-cpp
+```
+
+### On a Windows* System Using Visual Studio* Version 2017 or Newer
+
+Open "x64 Native Tools Command Prompt for VS2017" or 
+"x64 Native Tools Command Prompt for VS2019" and perform the following steps:
+1. Setup oneAPI development environment
+```
+C:\Program Files (x86)\intel\oneapi\setvars.bat
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake -G "Visual Studio 16 2019" ..
+cmake --build .
+```
+
+> Note: You can open the `dpcpp_interoperability.sln` in build folder to edit source
+> code with Microsoft Visual Studio integrated development environment.
+
+3. Run the program
+```
+./bin/Debug/sycl-interop-cpp.exe
+```
+
+### Include Files
+
+The include folder is located at ${DNNLROOT}\include on your development system.
+
+## Running the Sample
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/)
+
+### Application Parameters
+
+You can specify target device for this sample using command line arguments:
+* `cpu` (default) directs the application to run on system's CPU
+* `gpu` directs the sample to run on Intel GPU
+
+> Note: When executed with `gpu` parameter the 
+> sample will return an error if no Intel GPUs are found in the system.
+
+You can get additional information during execution of this sample by setting
+environment variable `DNNL_VERBOSE=1`.
+
+### Example of Output
+
+```
+Example passed on CPU.
+```
+
+When executed with `DNNL_VERBOSE=1`:
+```
+dnnl_verbose,info,oneDNN v1.95.0 (commit ae08a30fff7f76759fd4c5093c01707d0ee12c4c)
+dnnl_verbose,info,cpu,runtime:DPC++
+dnnl_verbose,info,cpu,isa:Intel AVX2
+dnnl_verbose,info,gpu,runtime:DPC++
+dnnl_verbose,info,cpu,engine,0,backend:OpenCL,name:Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz,driver_version:2020.10.7
+dnnl_verbose,info,gpu,engine,0,backend:Level Zero,name:Intel(R) Gen12LP,driver_version:0.8.0
+dnnl_verbose,exec,cpu,eltwise,jit:avx2,forward_training,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,2x3x4x5,0.36499
+Example passed on CPU.
+```
+
diff --git a/Libraries/oneDNN/dpcpp_interoperability/sample.json b/Libraries/oneDNN/dpcpp_interoperability/sample.json
new file mode 100644
index 0000000000..59a8898574
--- /dev/null
+++ b/Libraries/oneDNN/dpcpp_interoperability/sample.json
@@ -0,0 +1,26 @@
+{
+ "guid": "EF50CE31-C467-4374-8BCC-4E5F93B4D1C1",
+ "name": "oneDNN SYCL Interop",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneDNN"],
+ "description": "oneDNN SYCL extensions API programming for both Intel CPU and GPU.",
+ "toolchain": ["dpcpp"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["oneDNN", "tbb"],
+ "os": ["linux"],
+ "builder": ["ide","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "interop",
+		"steps": [
+			"mkdir build",
+      		        "cd build",
+           		"cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp",
+           		"make sycl-interop-cpp",
+			"./bin/sycl-interop-cpp cpu",
+			"SYCL_BE=PI_OPENCL ./bin/sycl-interop-cpp gpu"
+		 ]
+	}]
+ }
+}
diff --git a/Libraries/oneDNN/getting_started/CMakeLists.txt b/Libraries/oneDNN/getting_started/CMakeLists.txt
new file mode 100644
index 0000000000..efbdd1aba1
--- /dev/null
+++ b/Libraries/oneDNN/getting_started/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.11)
+if("${CMAKE_CXX_COMPILER}" STREQUAL "")  # no C++ compiler chosen by the user: default to clang/dpcpp
+    set(CMAKE_C_COMPILER "clang")
+    set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+project (getting_started)
+file(COPY $ENV{DNNLROOT}/examples/getting_started.cpp DESTINATION src)  # stage the example source under the build tree
+file(COPY $ENV{DNNLROOT}/examples/CMakeLists.txt DESTINATION src)       # ...together with the build script that compiles it
+if(WIN32 AND "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC")  # "x" prefix: valid when the ID is empty, no implicit variable dereference
+    file(COPY $ENV{DNNLROOT}/examples/template.vcxproj.user DESTINATION src)
+endif()
+add_subdirectory (${PROJECT_BINARY_DIR}/src bin)  # build the staged example; its outputs go to the bin subdirectory
diff --git a/Libraries/oneDNN/getting_started/License.txt b/Libraries/oneDNN/getting_started/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/Libraries/oneDNN/getting_started/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Libraries/oneDNN/getting_started/README.md b/Libraries/oneDNN/getting_started/README.md
new file mode 100644
index 0000000000..9c573c2b36
--- /dev/null
+++ b/Libraries/oneDNN/getting_started/README.md
@@ -0,0 +1,152 @@
+# oneDNN Getting Started Sample
+
+oneAPI Deep Neural Network Library (oneDNN) is an open-source performance
+library for deep learning applications. The library includes basic building
+blocks for neural networks optimized for Intel Architecture Processors
+and Intel Processor Graphics. oneDNN is intended for deep learning
+applications and framework developers interested in improving application
+performance on Intel CPUs and GPUs.
+You can find library source code and code used by these samples at oneDNN Github repository.
+
+This sample is implemented in C++ and executes on CPU or GPU. The sample
+also includes [a Jupyter Notebook](getting_started.ipynb) that
+demonstrates how to compile the code with various oneDNN configurations
+in Intel oneAPI DevCloud environment.
+
+| Optimized for                      | Description
+| :---                               | :---
+| OS                                 | Linux* Ubuntu* 18.04; Windows 10
+| Hardware                           | Skylake with GEN9 or newer
+| Software                           | Intel oneAPI Deep Neural Network Library (oneDNN), Intel oneAPI DPC++/C++ Compiler, Intel oneAPI Threading Building Blocks (oneTBB), GNU Compiler Collection, Intel C++ Compiler
+| What you will learn                | Running a simple convolutional model on Intel CPU or Intel GPU
+| Time to complete                   | 15 minutes
+
+## Purpose
+
+This sample demonstrates the basics of oneDNN programming model. With this
+sample you will learn:
+* How to create oneDNN memory objects.
+* How to get data from application buffer into a oneDNN memory object.
+* How tensor's logical dimensions and memory object formats relate.
+* How to create oneDNN primitives.
+* How to execute the primitives.
+
+The sample executes on system's CPU by default and can be executed on Intel GPU
+using a command line parameter `gpu`.
+
+## Key Implementation Details
+
+This sample uses example file `${DNNLROOT}/examples/getting_started.cpp`
+from oneDNN distribution. You can find this code in
+[oneDNN Github repository](https://github.com/oneapi-src/oneDNN/blob/dev-v2/examples/getting_started.cpp).
+
+Detailed code walkthrough is available in [oneDNN developer guide](https://oneapi-src.github.io/oneDNN/v2/getting_started.html)
+
+## License
+
+This code sample is licensed under MIT license.
+
+## Building the sample for CPU and GPU
+
+### On a Linux System
+
+Perform the following steps:
+1. Setup oneAPI development environment
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+3. Run the program
+```
+./bin/getting-started-cpp
+```
+
+By default the sample uses oneAPI DPC++/C++ Compiler and can execute on CPUs or
+Intel GPUs. You can build the sample with CPU support with other compilers
+and threading runtimes:
+* GNU C++ Compiler and GNU OpenMP runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_gomp
+CC=gcc CXX=g++ cmake ..
+```
+* Intel C++ Compiler and Intel OpenMP runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_iomp
+CC=icc CXX=icpc cmake ..
+```
+* Intel C++ Compiler and TBB runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_tbb
+CC=icc CXX=icpc cmake ..
+```
+
+### On a Windows* System Using Visual Studio* Version 2017 or Newer
+
+Open "x64 Native Tools Command Prompt for VS2017" or 
+"x64 Native Tools Command Prompt for VS2019" and perform the following steps:
+1. Setup oneAPI development environment
+```
+"C:\Program Files (x86)\intel\oneapi\setvars.bat"
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake -G "Visual Studio 16 2019" ..
+cmake --build .
+```
+
+> Note: You can open the `getting_started.sln` in build folder to edit source
+> code with Microsoft Visual Studio integrated development environment.
+
+
+3. Run the program
+```
+./bin/Debug/getting-started-cpp.exe
+```
+
+### Include Files
+
+The include folder is located at `${DNNLROOT}\include` on your development system.
+
+## Running the Sample
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/).
+
+### Application Parameters
+
+You can specify target device for this sample using command line arguments:
+* `cpu` (default) directs the application to run on system's CPU
+* `gpu` directs the sample to run on Intel GPU
+
+> Note: When executed with `gpu` parameter the 
+> sample will return an error if the sample is compiled with oneDNN configuration
+> that does not support GPU or no Intel GPUs are found in the system.
+
+You can get additional information during execution of this sample by setting
+environment variable `DNNL_VERBOSE=1`.
+
+### Example of Output
+
+```
+Example passed on CPU.
+```
+
+When executed with `DNNL_VERBOSE=1`:
+```
+dnnl_verbose,info,oneDNN v1.95.0 (commit ae08a30fff7f76759fd4c5093c01707d0ee12c4c)
+dnnl_verbose,info,cpu,runtime:DPC++
+dnnl_verbose,info,cpu,isa:Intel AVX2
+dnnl_verbose,info,gpu,runtime:DPC++
+dnnl_verbose,info,cpu,engine,0,backend:OpenCL,name:Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz,driver_version:2020.10.7
+dnnl_verbose,info,gpu,engine,0,backend:Level Zero,name:Intel(R) Gen12LP,driver_version:0.8.0
+dnnl_verbose,exec,cpu,eltwise,jit:avx2,forward_inference,data_f32::blocked:acdb:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,1x3x13x13,0.125
+Example passed on CPU.
+```
diff --git a/Libraries/oneDNN/getting_started/sample.json b/Libraries/oneDNN/getting_started/sample.json
new file mode 100644
index 0000000000..1a31985f9c
--- /dev/null
+++ b/Libraries/oneDNN/getting_started/sample.json
@@ -0,0 +1,25 @@
+{
+ "guid": "028AE3ED-2896-4C56-9066-42AA5D5FA973",
+ "name": "oneDNN Getting Started",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneDNN"],
+ "description": "Basic oneDNN programming model for both Intel CPU and GPU.",
+ "toolchain": ["dpcpp"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["oneDNN", "tbb"],
+ "os": ["linux"],
+ "builder": ["ide","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "gsg",
+		"steps": [
+			"mkdir build",
+      		        "cd build",
+           		"cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp",
+           		"make getting-started-cpp",
+			"SYCL_BE=PI_OPENCL ./bin/getting-started-cpp gpu"
+		 ]
+	}]
+ }
+}
diff --git a/Libraries/oneDNN/simple_model/CMakeLists.txt b/Libraries/oneDNN/simple_model/CMakeLists.txt
new file mode 100644
index 0000000000..82baa34153
--- /dev/null
+++ b/Libraries/oneDNN/simple_model/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.11)
+if("${CMAKE_CXX_COMPILER}" STREQUAL "")  # no -DCMAKE_CXX_COMPILER given: fall back to the oneAPI compilers
+    set(CMAKE_C_COMPILER "clang")        # has to be set ahead of project(), which probes the toolchain
+    set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+project (simple_model)
+file(COPY "$ENV{DNNLROOT}/examples/cnn_inference_f32.cpp" DESTINATION src)  # stage the oneDNN example under <build>/src
+file(COPY "$ENV{DNNLROOT}/examples/CMakeLists.txt" DESTINATION src)         # plus the build script shipped beside it
+if(WIN32 AND "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC")  # "x" prefix: valid even if the ID is empty, never re-read as a variable name
+    file(COPY "$ENV{DNNLROOT}/examples/template.vcxproj.user" DESTINATION src)
+endif()
+add_subdirectory ("${PROJECT_BINARY_DIR}/src" bin)  # build the staged copy; its outputs go to <build>/bin
diff --git a/Libraries/oneDNN/simple_model/License.txt b/Libraries/oneDNN/simple_model/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/Libraries/oneDNN/simple_model/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Libraries/oneDNN/simple_model/README.md b/Libraries/oneDNN/simple_model/README.md
new file mode 100644
index 0000000000..cd84489858
--- /dev/null
+++ b/Libraries/oneDNN/simple_model/README.md
@@ -0,0 +1,156 @@
+# oneDNN Simple Model Sample
+
+This sample is implemented in C++ and DPC++ and runs on CPU or GPU. The sample
+also includes [a Jupyter Notebook](simple_model.ipynb) that
+demonstrates how to port a oneDNN sample from CPU-only version to CPU & GPU
+in Intel oneAPI DevCloud environment.
+
+| Optimized for                      | Description
+| :---                               | :---
+| OS                                 | Linux* Ubuntu* 18.04; Windows 10
+| Hardware                           | Skylake with GEN9 or newer
+| Software                           | Intel oneAPI Deep Neural Network Library (oneDNN), Intel oneAPI DPC++/C++ Compiler, Intel oneAPI Threading Building Blocks (oneTBB), GNU Compiler Collection, Intel C++ Compiler
+| What you will learn                | Running a simple convolutional model on Intel CPU or Intel GPU
+| Time to complete                   | 15 minutes
+
+## Purpose
+
+This sample implements computational part of a convolutional neural network
+based on [ImageNet Classification with Deep Convolutional Neural Networks by Alex Krizhevsky et al.](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
+The network consists of 15 layers including convolution, rectified linear
+unit (ReLU), linear response normalization (LRN), and inner product.
+
+With this sample you will learn:
+* How to run a simple convolutional network on Intel CPU or Intel GPU
+* How to compile examples with Intel oneAPI DPC++/C++ Compiler, Intel C++ Compiler,
+and GNU C++ Compiler
+* How to switch between OpenMP and TBB for CPU parallelization
+* How to describe tensors with oneDNN memory objects
+* How to describe neural network layers with oneDNN primitives
+
+The sample executes on system's CPU by default and can be executed on Intel GPU
+using a command line parameter `gpu`.
+
+## Key Implementation Details
+
+This sample uses example file `${DNNLROOT}/examples/cnn_inference_f32.cpp`
+from oneDNN distribution. You can find this code in
+[oneDNN Github repository](https://github.com/oneapi-src/oneDNN/blob/dev-v2/examples/cnn_inference_f32.cpp).
+
+Detailed code walkthrough is available in [oneDNN developer guide](https://oneapi-src.github.io/oneDNN/v2/cnn_inference_f32_cpp.html)
+
+## License
+
+This code sample is licensed under MIT license.
+
+## Building the sample for CPU and GPU
+
+### On a Linux System
+
+Perform the following steps:
+1. Setup oneAPI development environment
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+3. Run the program
+```
+./bin/cnn-inference-f32-cpp
+```
+
+By default the sample uses oneAPI DPC++/C++ Compiler and can execute on CPUs or
+Intel GPUs. You can build the sample with CPU support with other compilers
+and threading runtimes:
+* GNU C++ Compiler and GNU OpenMP runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_gomp
+CC=gcc CXX=g++ cmake ..
+```
+* Intel C++ Compiler and Intel OpenMP runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_iomp
+CC=icc CXX=icpc cmake ..
+```
+* Intel C++ Compiler and TBB runtime
+```
+source ${INTEL_ONEAPI_INSTALL_FOLDER}/setvars.sh --dnnl-configuration=cpu_tbb
+CC=icc CXX=icpc cmake ..
+```
+
+### On a Windows* System Using Visual Studio* Version 2017 or Newer
+
+Open "x64 Native Tools Command Prompt for VS2017" or 
+"x64 Native Tools Command Prompt for VS2019" and perform the following steps:
+1. Setup oneAPI development environment
+```
+"C:\Program Files (x86)\intel\oneapi\setvars.bat"
+```
+2. Build the program using `cmake`
+```
+mkdir build
+cd build
+cmake -G "Visual Studio 16 2019" ..
+cmake --build .
+```
+
+> Note: You can open the `simple_model.sln` in build folder to edit source
+> code with Microsoft Visual Studio integrated development environment.
+
+### Include Files
+The include folder is located at `${DNNLROOT}\include` on your development system.
+
+3. Run the program
+```
+./bin/Debug/cnn-inference-f32-cpp.exe
+```
+
+### Include Files
+
+The include folder is located at `${DNNLROOT}\include` on your development system.
+
+## Running the Sample
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/).
+
+### Application Parameters
+
+You can specify target device for this sample using command line arguments:
+* `cpu` (default) directs the application to run on system's CPU
+* `gpu` directs the sample to run on Intel GPU
+
+> Note: When executed with `gpu` parameter the 
+> sample will return an error if no Intel GPUs are found in the system.
+
+You can get additional information during execution of this sample by setting
+environment variable `DNNL_VERBOSE=1`.
+
+### Example of Output
+
+```
+Use time: 28.84 ms per iteration.
+Example passed on CPU.
+```
+
+When executed with `DNNL_VERBOSE=1`:
+```
+dnnl_verbose,info,oneDNN v1.95.0 (commit ae08a30fff7f76759fd4c5093c01707d0ee12c4c)
+dnnl_verbose,info,cpu,runtime:DPC++
+dnnl_verbose,info,cpu,isa:Intel AVX2
+dnnl_verbose,info,gpu,runtime:DPC++
+dnnl_verbose,info,cpu,engine,0,backend:OpenCL,name:Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz,driver_version:2020.10.7
+dnnl_verbose,info,gpu,engine,0,backend:Level Zero,name:Intel(R) Gen12LP,driver_version:0.8.0
+dnnl_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb8a:f0,,,96x3x11x11,0.24292
+dnnl_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcde:f0 dst_f32::blocked:aBCde8c8b:f0,,,2x128x48x5x5,0.26709
+dnnl_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,,,384x256x3x3,1.16699
+...
+Use time: 20.11 ms per iteration.
+Example passed on CPU.
+```
+
diff --git a/Libraries/oneDNN/simple_model/sample.json b/Libraries/oneDNN/simple_model/sample.json
new file mode 100644
index 0000000000..3a883d7f20
--- /dev/null
+++ b/Libraries/oneDNN/simple_model/sample.json
@@ -0,0 +1,27 @@
+{
+ "guid": "389BBED3-456D-4092-B6D8-DDF782865D66",
+ "name": "oneDNN CNN FP32 Inference",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneDNN"],
+ "description": "Run a simple CNN on both Intel CPU and GPU with sample C++ code.",
+ "toolchain": ["dpcpp","gcc","icc"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["oneDNN", "tbb","compiler|icc"],
+ "os": ["linux"],
+ "builder": ["ide","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "infer",
+		"steps": [
+			"mkdir build",
+      		        "cd build",
+           		"cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp",
+           		"make cnn-inference-f32-cpp",
+			"./bin/cnn-inference-f32-cpp cpu",
+			"SYCL_BE=PI_OPENCL ./bin/cnn-inference-f32-cpp gpu"
+		 ]
+	}]
+
+ }
+}
diff --git a/Libraries/oneDNN/tutorials/CMakeLists.txt b/Libraries/oneDNN/tutorials/CMakeLists.txt
new file mode 100644
index 0000000000..4c8993ce6b
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8.11)
+if("${CMAKE_CXX_COMPILER}" STREQUAL "")  # caller chose no C++ compiler: default to the oneAPI toolchain
+    set(CMAKE_CXX_COMPILER "dpcpp")
+    set(CMAKE_C_COMPILER "clang")
+endif()
+project(oneDNN)
+if(NOT "$ENV{EXAMPLE_ROOT}" STREQUAL "")  # EXAMPLE_ROOT, when set, names the examples directory to build
+    add_subdirectory($ENV{EXAMPLE_ROOT} out)
+else()
+    message(" - use default examples")
+    add_subdirectory($ENV{DNNLROOT}/examples out)  # otherwise build the examples found under DNNLROOT
+endif()
diff --git a/Libraries/oneDNN/tutorials/License.txt b/Libraries/oneDNN/tutorials/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Libraries/oneDNN/tutorials/README.md b/Libraries/oneDNN/tutorials/README.md
new file mode 100644
index 0000000000..fcdafb80b6
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/README.md
@@ -0,0 +1,21 @@
+# Intel oneAPI Deep Neural Network Library (oneDNN)
+
+oneAPI Deep Neural Network Library (oneDNN) is an open-source performance library for deep learning applications. The library includes basic building blocks for neural networks optimized for Intel Architecture Processors and Intel Processor Graphics. oneDNN is intended for deep learning applications and framework developers interested in improving application performance on Intel CPUs and GPUs.
+
+Github : https://github.com/oneapi-src/oneDNN
+
+## License  
+The code samples are licensed under MIT license
+
+# oneDNN Tutorials
+
+| Type      | Name                 | Description                                                  |
+| --------- | ----------------------- | ------------------------------------------------------------ |
+| Component | [getting_started](getting_started.ipynb)  | The sample also includes a Jupyter notebook with step by step instructions on building code with different compilers and runtime configurations oneDNN support. |
+| Component | [simple_model](simple_model.ipynb)| A Jupyter notebook with step by step instructions on running oneDNN-based application on a GPU. |
+| Component | [verbose_jitdump](verbose_jitdump.ipynb) | This Jupyter Notebook demonstrates how to use Verbose Mode and JIT Dump to profile oneDNN samples. |
+| Component | [analyze_isa_with_dispatcher_control](analyze_isa_with_dispatcher_control.ipynb) | This Jupyter Notebook demonstrates how to use CPU Dispatch Control to generate JIT codes among different ISA on CPU and also analyze JIT kernels among ISAs.|
+>  Notice : Please use Intel oneAPI DevCloud as the environment for jupyter notebook samples. \
+Users can refer to [DevCloud Getting Started](https://devcloud.intel.com/oneapi/get-started/) for using DevCloud \
+Users can use JupyterLab from DevCloud via "One-click Login in", and download samples via "git clone" or the "oneapi-cli" tool \
+Once users are in the JupyterLab with downloaded jupyter notebook samples, they can start following the steps without further installation needed.
diff --git a/Libraries/oneDNN/tutorials/analyze_isa_with_dispatcher_control.ipynb b/Libraries/oneDNN/tutorials/analyze_isa_with_dispatcher_control.ipynb
new file mode 100644
index 0000000000..34222da428
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/analyze_isa_with_dispatcher_control.ipynb
@@ -0,0 +1,872 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analyze ISA usage with Intel® oneAPI Deep Neural Network Library (oneDNN) Samples by using CPU Dispatcher Control\n",
+    "\n",
+    "## Learning Objectives\n",
+    "In this module the developer will:\n",
+    "* Learn how to use CPU Dispatch Control to generate JIT codes among different Instruction Set Architecture (ISA) on CPU\n",
+    "* Analyze different JIT Kernel and CPU instructions usage among different ISA\n",
+    "    - AVX512 vs AVX2\n",
+    "    - AVX512 VNNI vs AVX512\n",
+    "    - AVX512 BF16 vs AVX512 (Optional, no hardware support in DevCloud now.)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This module also shows the elapsed time percentage over different oneDNN JIT kernels, so users can also see the usage of specific JIT Kernels for VNNI or BF16 instructions.\n",
+    "\n",
+    "<img src=\"images/vnni.JPG\" style=\"float:left\" width=400>\n",
+    "<img src=\"images/bf16.JPG\" style=\"float:right\" width=400>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# CPU Dispatch Control and ISA Analysis Exercise\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## prerequisites\n",
+    "****\n",
+    "### Step 1: Prepare the build/run environment\n",
+    "oneDNN has four different configurations inside the Intel oneAPI toolkits. Each configuration is in a different folder under the oneDNN installation path, and each configuration supports a different compiler or threading library  \n",
+    "\n",
+    "Set the installation path of your oneAPI toolkit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# default path: /opt/intel/oneapi\n",
+    "%env ONEAPI_INSTALL=/opt/intel/oneapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!printf '%s\\n'     $ONEAPI_INSTALL/dnnl/latest/cpu_*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see, there are four different folders under the oneDNN installation path, and each of those configurations supports different features. This tutorial will use the cpu_gomp configuration to do ISA analysis on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a lab folder for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p lab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install required python packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get current platform information for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from profiling.profile_utils import PlatformUtils\n",
+    "plat_utils = PlatformUtils()\n",
+    "plat_utils.dump_platform_info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "###  Step 2: Preparing the samples code\n",
+    "\n",
+    "This exercise uses the cnn_inference_f32.cpp and cnn_inference_int8.cpp examples from the oneDNN installation path.\n",
+    "\n",
+    "The section below will copy the cnn_inference_f32.cpp and cnn_inference_int8.cpp files into lab folder.  \n",
+    "This section also copies the required header files and CMake file into the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/cnn_inference_f32.cpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/cnn_inference_int8.cpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/cpu_cnn_training_bf16.cpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/example_utils.hpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/example_utils.h lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_gomp/examples/CMakeLists.txt lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Build and Run with GNU Compiler and OpenMP \n",
+    "One of the oneDNN configurations supports the GNU Compiler.\n",
+    "The following section shows you how to build with the GNU Compiler and run on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler **g++** command and flags that will generate the executable.\n",
+    "In order to use GNU compiler and related OMP runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are related cmake arguments for cpu_gomp configuration: \n",
+    "\n",
+    "   -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_gomp\n",
+    "cd cpu_gomp\n",
+    "cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE\n",
+    "make\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or a local machine.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user can refer to run.sh below to run cnn-inference-f32-cpp on CPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# enable verbose log\n",
+    "export DNNL_VERBOSE=0\n",
+    "./cpu_gomp/out/cnn-inference-f32-cpp\n",
+    "./cpu_gomp/out/cnn-inference-int8-cpp\n",
+    "./cpu_gomp/out/cpu-cnn-training-bf16-cpp\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit **build.sh** and **run.sh** to the job queue.\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! rm -rf dpcpp;chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "## Run Time CPU Dispatcher Controls\n",
+    "***\n",
+    "In this section, we run workloads on the latest Xeon server from DevCloud, and use CPU dispatcher controls to generate JIT kernels among different ISA for comparison.\n",
+    "Users will understand the usage of different ISA by analyzing oneDNN Verbose logs and JIT Dump files.\n",
+    "Refer to the [link](https://oneapi-src.github.io/oneDNN/dev_guide_cpu_dispatcher_control.html) for detailed CPU Dispatcher Controls information"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When the feature is enabled at build-time, you can use the DNNL_MAX_CPU_ISA environment variable to limit the processor features that oneDNN is able to detect to a certain Instruction Set Architecture (ISA) and older instruction sets. It can also be used to enable ISAs with initial support in the library that are otherwise disabled by default.\n",
+    "\n",
+    "|Environment variable Value|Description| introduced with microarchitecture |\n",
+    "|:----|:-----|:-----|\n",
+    "|SSE41|Intel Streaming SIMD Extensions 4.1 (Intel SSE4.1)| Penryn |\n",
+    "|AVX|Intel Advanced Vector Extensions (Intel AVX)|Sandy Bridge |\n",
+    "|AVX2|Intel Advanced Vector Extensions 2 (Intel AVX2)| Haswell |\n",
+    "|AVX512_CORE|Intel AVX-512 with AVX512BW, AVX512VL, and AVX512DQ extensions| Skylake-X |\n",
+    "|AVX512_CORE_VNNI|Intel AVX-512 with Intel Deep Learning Boost (Intel DL Boost)| Cascade Lake |\n",
+    "|AVX512_CORE_BF16|Intel AVX-512 with Intel DL Boost and bfloat16 support| Cooper Lake |\n",
+    "|ALL|No restrictions on the above ISAs, but excludes the below ISAs with initial support in the library (default)| |\n",
+    "|AVX512_CORE_AMX|Intel AVX-512 with Intel DL Boost and bfloat16 support and Intel Advanced Matrix Extensions (Intel AMX) with 8-bit integer and bfloat16 support (initial support) | |\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## ISA Comparison\n",
+    "***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The section below compares and analyzes different ISA upon JIT Kernel usage and CPU instruction usage.\n",
+    "\n",
+    "The table below shows the different comparison by using different oneDNN samples,   \n",
+    "and also brings up the keypoint of the comparison. \n",
+    "\n",
+    "|ISA Comparison | oneDNN sample | Description | \n",
+    "|:----|:-----|:-----|\n",
+    "|AVX512 vs AVX2 |cnn-inference-f32-cpp| show the usage of zmm instruction and avx512 JIT kernel | \n",
+    "|AVX512 VNNI vs AVX512 |cnn-inference-int8-cpp| show the usage of VNNI instruction and VNNI JIT kernel|\n",
+    "|AVX512 BF16 vs AVX512| cpu-cnn-training-bf16-cpp| show the usage of BF16 instruction and BF16 JIT kernel| \n",
+    "\n",
+    "Those comparisons can be conducted on the same CPU microarchitecture with the help of oneDNN CPU dispatcher control.  \n",
+    "Users can also conduct similar comparisons for TensorFlow or PyTorch workloads by replacing the oneDNN sample with other workloads.  \n",
+    "By conducting similar comparisons of real workloads, users can understand:  \n",
+    "* Whether the workloads leverage the latest instructions like VNNI on the platform\n",
+    "* How much performance benefit is gained by using the latest instruction on the same platform\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 1: Pick one of the ISA comparisons\n",
+    "After users pick an ISA comparison, related environment variables will be exported.  \n",
+    "  \n",
+    "The section below will list out all ISA comparison options with index number."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ISA_COMPARISON_LIST=[\"avx512_avx2\",\"avx512-vnni_avx512\",\"avx512-bf16_avx512\"]\n",
+    "index =0 \n",
+    "for ISA_C in ISA_COMPARISON_LIST:\n",
+    "    print(\" %d : %s \" %(index, ISA_C))\n",
+    "    index+=1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Please select a comparison option and assign its index to the ISAIndex variable.\n",
+    ">NOTE: DevCloud currently has no bf16 support. Please **IGNORE** the **avx512-bf16_avx512** comparison."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ISAIndex=0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The section below will export related environment variables according to the selected ISA comparison."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ISA_COMPARISON = ISA_COMPARISON_LIST[ISAIndex]\n",
+    "print(\" Compare between \", ISA_COMPARISON)\n",
+    "import os\n",
+    "if ISA_COMPARISON == \"avx512_avx2\":\n",
+    "    # variables for AVX2\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL1\"] = \"AVX2\"\n",
+    "    os.environ[\"DNNL_APP_VAL1\"] = \"cnn-inference-f32-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL1\"] = \"log_cpu_f32_avx2.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL1\"] = \"jitdump_f32_avx2\"\n",
+    "    # variables for AVX512\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL2\"] = \"AVX512_CORE\"\n",
+    "    os.environ[\"DNNL_APP_VAL2\"] = \"cnn-inference-f32-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL2\"] = \"log_cpu_f32_avx512.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL2\"] = \"jitdump_f32_avx512\"\n",
+    "    # AVX512 specific register\n",
+    "    os.environ[\"DNNL_ISA_KEYWORD\"] = \"zmm\"\n",
+    "    \n",
+    "elif ISA_COMPARISON == \"avx512-vnni_avx512\":\n",
+    "    # variables for AVX512\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL1\"] = \"AVX512_CORE\"\n",
+    "    os.environ[\"DNNL_APP_VAL1\"] = \"cnn-inference-int8-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL1\"] = \"log_cpu_int8_avx512.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL1\"] = \"jitdump_int8_avx512\"\n",
+    "    # variables for AVX512 VNNI\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL2\"] = \"AVX512_CORE_VNNI\"\n",
+    "    os.environ[\"DNNL_APP_VAL2\"] = \"cnn-inference-int8-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL2\"] = \"log_cpu_int8_avx512_vnni.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL2\"] = \"jitdump_int8_avx512_vnni\"\n",
+    "    # VNNI specific instruction\n",
+    "    os.environ[\"DNNL_ISA_KEYWORD\"] = \"vpdpbusd\"   \n",
+    "    \n",
+    "elif ISA_COMPARISON == \"avx512-bf16_avx512\":\n",
+    "    # variables for AVX512\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL1\"] = \"AVX512_CORE\"\n",
+    "    os.environ[\"DNNL_APP_VAL1\"] = \"cpu-cnn-training-bf16-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL1\"] = \"log_cpu_bf16_avx512.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL1\"] = \"jitdump_bf16_avx512\"\n",
+    "    # variables for AVX512 BF16\n",
+    "    os.environ[\"DNNL_MAX_CPU_ISA_VAL2\"] = \"AVX512_CORE_BF16\"\n",
+    "    os.environ[\"DNNL_APP_VAL2\"] = \"cpu-cnn-training-bf16-cpp\"\n",
+    "    os.environ[\"DNNL_LOG_VAL2\"] = \"log_cpu_bf16_avx512_bf16.csv\"\n",
+    "    os.environ[\"DNNL_JIT_FD_VAL2\"] = \"jitdump_bf16_avx512_bf16\"\n",
+    "    # BF16 specific instructions\n",
+    "    os.environ[\"DNNL_ISA_KEYWORD\"] = \"vdpbf16ps|vcvtne2ps2bf16\"        "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 2: Script - run.sh for first selected ISA.    ex: AVX2, or AVX512_CORE\n",
+    "****\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user can refer to run.sh below to run the oneDNN sample on CPU with the selected ISA."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "  \n",
+    "print out the selected ISA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! echo $DNNL_MAX_CPU_ISA_VAL1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "prepare run.sh and use DNNL_MAX_CPU_ISA to run sample on selected ISA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force  > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# enable verbose log\n",
+    "export DNNL_VERBOSE=2 \n",
+    "# enable JIT Dump\n",
+    "export DNNL_JIT_DUMP=1\n",
+    "\n",
+    "DNNL_MAX_CPU_ISA=$DNNL_MAX_CPU_ISA_VAL1 ./cpu_gomp/out/$DNNL_APP_VAL1 cpu >> $DNNL_LOG_VAL1 2>&1\n",
+    "\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting  **run.sh** to the job queue\n",
+    "> NOTE: By assigning clx to property, users can execute the sample on a Cascade Lake platform from Intel DevCloud."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! export property=clx; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run.sh; else ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "####  gather all JIT bin files into a folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! rm -rf $DNNL_JIT_FD_VAL1; mkdir $DNNL_JIT_FD_VAL1; mv *.bin $DNNL_JIT_FD_VAL1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Script - run.sh for second selected ISA. ex: AVX512_CORE_VNNI or AVX512_CORE_BF16\n",
+    "**** \n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user can refer to run.sh below to run the oneDNN sample on CPU with the selected ISA."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "  \n",
+    "print out the selected ISA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! echo $DNNL_MAX_CPU_ISA_VAL2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "prepare run.sh and use DNNL_MAX_CPU_ISA to run sample on selected ISA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force  > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# enable verbose log\n",
+    "export DNNL_VERBOSE=2 \n",
+    "# enable JIT Dump\n",
+    "export DNNL_JIT_DUMP=1\n",
+    "\n",
+    "DNNL_MAX_CPU_ISA=$DNNL_MAX_CPU_ISA_VAL2 ./cpu_gomp/out/$DNNL_APP_VAL2 cpu >> $DNNL_LOG_VAL2 2>&1\n",
+    "\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting  **run.sh** to the job queue\n",
+    "> NOTE: By assigning clx to property, users can execute the sample on a Cascade Lake platform from Intel DevCloud.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! export property=clx; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run.sh; else ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "####  gather all JIT bin files into a folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! rm -rf $DNNL_JIT_FD_VAL2; mkdir $DNNL_JIT_FD_VAL2; mv *.bin $DNNL_JIT_FD_VAL2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "****\n",
+    "### Step 4: oneDNN Verbose Log JIT Kernel Time BreakDown\n",
+    "oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system.   \n",
+    "Therefore, users can see different JIT kernel types between the first selected ISA and the second selected ISA.   \n",
+    "For example, users can see the avx512_core_vnni JIT kernel if the workload uses VNNI instructions on a Cascade Lake platform.  \n",
+    "Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Parse verbose log and get the data back"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from profiling.profile_utils import oneDNNUtils, oneDNNLog\n",
+    "onednn = oneDNNUtils()\n",
+    "\n",
+    "logfile1 = os.environ[\"DNNL_LOG_VAL1\"]\n",
+    "log1 = oneDNNLog()\n",
+    "log1.load_log(logfile1)\n",
+    "exec_data1 = log1.exec_data\n",
+    "\n",
+    "logfile2 = os.environ[\"DNNL_LOG_VAL2\"]\n",
+    "log2 = oneDNNLog()\n",
+    "log2.load_log(logfile2)\n",
+    "exec_data2 = log2.exec_data\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "####   JIT Kernel Type Time breakdown for first selected ISA  \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(exec_data1,\"jit\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "####   JIT Kernel Type Time breakdown for second selected ISA\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE: users should be able to see **avx512_core_vnni** JIT Kernel if the sample runs with **VNNI** instructions  \n",
+    "> NOTE: users should be able to see **avx512_core_bf16** JIT Kernel if the sample runs with **BF16** instructions  \n",
+    "> NOTE: users should be able to see **avx512** JIT Kernel if the sample runs with **AVX512** instructions  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(exec_data2,\"jit\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "####   Primitives Type Speedup from second selected ISA\n",
+    "oneDNN samples here are not for performance benchmarking, so the diagram below gives you only a rough idea of performance speedup from the second selected ISA such as AVX512, VNNI, or BF16."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    " onednn.stats_comp('type', 'time',log2, log1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "****\n",
+    "### Step 5: Inspect JIT Kernel \n",
+    "In this section, we analyze the JIT dump files produced by the sample runs in Step 2 and Step 3.   \n",
+    "Users should be able to see exact CPU instruction usage like VNNI or BF16 from those JIT Dump files."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### inspect either first or second selected ISA by setting VALIndex."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* To inspect the first selected ISA JIT Dump files, set VALIndex as 1.  \n",
+    "* To inspect the second selected ISA JIT Dump files, set VALIndex as 2.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "VALIndex=2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### List out all JIT Dump Files with index number for the first or second selected ISA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "VAL=\"DNNL_JIT_FD_VAL\"+str(VALIndex)\n",
+    "JIT_DUMP_FD=os.environ[VAL]\n",
+    "print(\"Inspect Folder: \", JIT_DUMP_FD)\n",
+    "\n",
+    "filenames= os.listdir (JIT_DUMP_FD) \n",
+    "result = []\n",
+    "keyword = \".bin\"\n",
+    "for filename in filenames: \n",
+    "    #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n",
+    "    if filename.find(keyword) != -1:\n",
+    "        result.append(filename)\n",
+    "result.sort()\n",
+    "\n",
+    "index =0 \n",
+    "for folder in result:\n",
+    "    print(\" %d : %s \" %(index, folder))\n",
+    "    index+=1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Pick a JIT Dump file by putting its index value below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FdIndex=0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### export JIT Dump file to environment variable JITFILE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if FdIndex < len(result):\n",
+    "    logfile = result[FdIndex]\n",
+    "    os.environ[\"JITFILE\"] = JIT_DUMP_FD+os.sep+logfile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Disassemble the JIT Dump file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE: zmm register is introduced by AVX512 ISA.  \n",
+    "Users should see usage of **zmm** register in AVX512 JIT dump files.  \n",
+    "\n",
+    "> NOTE: vpdpbusd is introduced by AVX512_VNNI ISA.  \n",
+    "Users should see usage of **vpdpbusd** in AVX512_VNNI JIT dump files. \n",
+    "\n",
+    "> NOTE: **vdpbf16ps**, **vcvtne2ps2bf16**, and **vcvtneps2bf16** are introduced by AVX512_BF16 ISA.  \n",
+    "Users should see usage of vdpbf16ps, vcvtne2ps2bf16 or vcvtneps2bf16 in AVX512_BF16 JIT dump files. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE: To disassemble the vdpbf16ps, vcvtne2ps2bf16, and vcvtneps2bf16 instructions, users must use objdump **v2.34** or above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!objdump -D -b binary -mi386:x86-64 $JITFILE | grep -E $DNNL_ISA_KEYWORD"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Summary\n",
+    "In this lab the developer learned the following:\n",
+    "* use CPU Dispatch Control to generate JIT code for different Instruction Set Architectures (ISAs) on CPU\n",
+    "* understand the differences in JIT Kernel and CPU instruction usage among different ISAs\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "525.6px",
+    "left": "28px",
+    "top": "137.8px",
+    "width": "301.109px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.cpp b/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.cpp
new file mode 100644
index 0000000000..606c8ddc77
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.cpp
@@ -0,0 +1,700 @@
+
+#include <assert.h>
+
+#include <chrono>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "dnnl.hpp"
+
+using namespace dnnl;
+
+using namespace std;
+
+// Returns the total element count for `dims`: the product of all extents (1 if `dims` is empty).
+memory::dim product(const memory::dims &dims) {
+    return std::accumulate(dims.cbegin(), dims.cend(), memory::dim {1}, std::multiplies<memory::dim>());
+}
+
+void simple_net(int times = 100) {
+    using tag = memory::format_tag;
+    using dt = memory::data_type;
+
+
+    engine eng(engine::kind::cpu, 0);
+    stream s(eng);
+
+    std::vector<primitive> net;
+    std::vector<std::unordered_map<int, memory>> net_args;
+
+
+    const memory::dim batch = 1;
+
+    // AlexNet: conv1
+    // {batch, 3, 227, 227} (x) {96, 3, 11, 11} -> {batch, 96, 55, 55}
+    // strides: {4, 4}
+    memory::dims conv1_src_tz = { batch, 3, 227, 227 };
+    memory::dims conv1_weights_tz = { 96, 3, 11, 11 };
+    memory::dims conv1_bias_tz = { 96 };
+    memory::dims conv1_dst_tz = { batch, 96, 55, 55 };
+    memory::dims conv1_strides = { 4, 4 };
+    memory::dims conv1_padding = { 0, 0 };
+
+/// Allocate buffers for input and output data, weights, and bias.
+/// @snippet cpu_cnn_inference_f32.cpp Allocate buffers
+//[Allocate buffers]
+    std::vector<float> user_src(batch * 3 * 227 * 227);
+    std::vector<float> user_dst(batch * 1000);
+    std::vector<float> conv1_weights(product(conv1_weights_tz));
+    std::vector<float> conv1_bias(product(conv1_bias_tz));
+//[Allocate buffers]
+
+
+    auto user_src_memory = memory(
+            { { conv1_src_tz }, dt::f32, tag::nchw }, eng, user_src.data());
+    auto user_weights_memory
+            = memory({ { conv1_weights_tz }, dt::f32, tag::oihw }, eng,
+                    conv1_weights.data());
+    auto conv1_user_bias_memory = memory(
+            { { conv1_bias_tz }, dt::f32, tag::x }, eng, conv1_bias.data());
+
+
+
+    auto conv1_src_md = memory::desc({ conv1_src_tz }, dt::f32, tag::any);
+    auto conv1_bias_md = memory::desc({ conv1_bias_tz }, dt::f32, tag::any);
+    auto conv1_weights_md
+            = memory::desc({ conv1_weights_tz }, dt::f32, tag::any);
+    auto conv1_dst_md = memory::desc({ conv1_dst_tz }, dt::f32, tag::any);
+
+
+
+    auto conv1_desc = convolution_forward::desc(prop_kind::forward_inference,
+            algorithm::convolution_direct, conv1_src_md, conv1_weights_md, conv1_bias_md,
+            conv1_dst_md, conv1_strides, conv1_padding, conv1_padding);
+
+    auto conv1_prim_desc = convolution_forward::primitive_desc(conv1_desc, eng);
+
+
+    auto conv1_src_memory = user_src_memory;
+    if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
+        conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
+        net.push_back(reorder(user_src_memory, conv1_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, user_src_memory },
+                { DNNL_ARG_TO, conv1_src_memory } });
+    }
+
+    auto conv1_weights_memory = user_weights_memory;
+    if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) {
+        conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng);
+        reorder(user_weights_memory, conv1_weights_memory)
+                .execute(s, user_weights_memory, conv1_weights_memory);
+    }
+
+    auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng);
+
+
+
+    net.push_back(convolution_forward(conv1_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory },
+            { DNNL_ARG_WEIGHTS, conv1_weights_memory },
+            { DNNL_ARG_BIAS, conv1_user_bias_memory },
+            { DNNL_ARG_DST, conv1_dst_memory } });
+
+
+    // AlexNet: relu1
+    // {batch, 96, 55, 55} -> {batch, 96, 55, 55}
+    const float negative1_slope = 1.0f;
+
+
+
+    auto relu1_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv1_dst_memory.get_desc(),
+            negative1_slope);
+    auto relu1_prim_desc = eltwise_forward::primitive_desc(relu1_desc, eng);
+
+    net.push_back(eltwise_forward(relu1_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
+            { DNNL_ARG_DST, conv1_dst_memory } });
+
+
+    // AlexNet: lrn1
+    // {batch, 96, 55, 55} -> {batch, 96, 55, 55}
+    // local size: 5
+    // alpha1: 0.0001
+    // beta1: 0.75
+    const memory::dim local1_size = 5;
+    const float alpha1 = 0.0001f;
+    const float beta1 = 0.75f;
+    const float k1 = 1.0f;
+
+    // create lrn primitive and add it to net
+    auto lrn1_desc = lrn_forward::desc(prop_kind::forward_inference,
+            algorithm::lrn_across_channels, conv1_dst_memory.get_desc(), local1_size,
+            alpha1, beta1, k1);
+    auto lrn1_prim_desc = lrn_forward::primitive_desc(lrn1_desc, eng);
+    auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng);
+
+    net.push_back(lrn_forward(lrn1_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
+            { DNNL_ARG_DST, lrn1_dst_memory } });
+
+    // AlexNet: pool1
+    // {batch, 96, 55, 55} -> {batch, 96, 27, 27}
+    // kernel: {3, 3}
+    // strides: {2, 2}
+    memory::dims pool1_dst_tz = { batch, 96, 27, 27 };
+    memory::dims pool1_kernel = { 3, 3 };
+    memory::dims pool1_strides = { 2, 2 };
+    memory::dims pool_padding = { 0, 0 };
+
+    auto pool1_dst_md = memory::desc({ pool1_dst_tz }, dt::f32, tag::any);
+
+    auto pool1_desc = pooling_forward::desc(prop_kind::forward_inference,
+            algorithm::pooling_max, lrn1_dst_memory.get_desc(), pool1_dst_md,
+            pool1_strides, pool1_kernel, pool_padding, pool_padding);
+    auto pool1_pd = pooling_forward::primitive_desc(pool1_desc, eng);
+    auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng);
+
+    net.push_back(pooling_forward(pool1_pd));
+    net_args.push_back({ { DNNL_ARG_SRC, lrn1_dst_memory },
+            { DNNL_ARG_DST, pool1_dst_memory } });
+
+    // AlexNet: conv2
+    // {batch, 96, 27, 27} (x) {2, 128, 48, 5, 5} -> {batch, 256, 27, 27}
+    // strides: {1, 1}
+    memory::dims conv2_src_tz = { batch, 96, 27, 27 };
+    memory::dims conv2_weights_tz = { 2, 128, 48, 5, 5 };
+    memory::dims conv2_bias_tz = { 256 };
+    memory::dims conv2_dst_tz = { batch, 256, 27, 27 };
+    memory::dims conv2_strides = { 1, 1 };
+    memory::dims conv2_padding = { 2, 2 };
+
+    std::vector<float> conv2_weights(product(conv2_weights_tz));
+    std::vector<float> conv2_bias(product(conv2_bias_tz));
+
+    // create memory for user data
+    auto conv2_user_weights_memory
+            = memory({ { conv2_weights_tz }, dt::f32, tag::goihw }, eng,
+                    conv2_weights.data());
+    auto conv2_user_bias_memory = memory(
+            { { conv2_bias_tz }, dt::f32, tag::x }, eng, conv2_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto conv2_src_md = memory::desc({ conv2_src_tz }, dt::f32, tag::any);
+    auto conv2_bias_md = memory::desc({ conv2_bias_tz }, dt::f32, tag::any);
+    auto conv2_weights_md
+            = memory::desc({ conv2_weights_tz }, dt::f32, tag::any);
+    auto conv2_dst_md = memory::desc({ conv2_dst_tz }, dt::f32, tag::any);
+
+    // create a convolution
+    auto conv2_desc = convolution_forward::desc(prop_kind::forward_inference,
+            algorithm::convolution_direct, conv2_src_md, conv2_weights_md, conv2_bias_md,
+            conv2_dst_md, conv2_strides, conv2_padding, conv2_padding);
+    auto conv2_prim_desc = convolution_forward::primitive_desc(conv2_desc, eng);
+
+    auto conv2_src_memory = pool1_dst_memory;
+    if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) {
+        conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng);
+        net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, pool1_dst_memory },
+                { DNNL_ARG_TO, conv2_src_memory } });
+    }
+
+    auto conv2_weights_memory = conv2_user_weights_memory;
+    if (conv2_prim_desc.weights_desc()
+            != conv2_user_weights_memory.get_desc()) {
+        conv2_weights_memory = memory(conv2_prim_desc.weights_desc(), eng);
+        reorder(conv2_user_weights_memory, conv2_weights_memory)
+                .execute(s, conv2_user_weights_memory, conv2_weights_memory);
+    }
+
+    auto conv2_dst_memory = memory(conv2_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(convolution_forward(conv2_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv2_src_memory },
+            { DNNL_ARG_WEIGHTS, conv2_weights_memory },
+            { DNNL_ARG_BIAS, conv2_user_bias_memory },
+            { DNNL_ARG_DST, conv2_dst_memory } });
+
+    // AlexNet: relu2
+    // {batch, 256, 27, 27} -> {batch, 256, 27, 27}
+    const float negative2_slope = 1.0f;
+
+    // create relu primitive and add it to net
+    auto relu2_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv2_dst_memory.get_desc(),
+            negative2_slope);
+    auto relu2_prim_desc = eltwise_forward::primitive_desc(relu2_desc, eng);
+
+    net.push_back(eltwise_forward(relu2_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
+            { DNNL_ARG_DST, conv2_dst_memory } });
+
+    // AlexNet: lrn2
+    // {batch, 256, 27, 27} -> {batch, 256, 27, 27}
+    // local size: 5
+    // alpha2: 0.0001
+    // beta2: 0.75
+    const memory::dim local2_size = 5;
+    const float alpha2 = 0.0001f;
+    const float beta2 = 0.75f;
+    const float k2 = 1.0f;
+
+    // create lrn primitive and add it to net
+    auto lrn2_desc = lrn_forward::desc(prop_kind::forward_inference,
+            algorithm::lrn_across_channels, conv2_prim_desc.dst_desc(), local2_size,
+            alpha2, beta2, k2);
+    auto lrn2_prim_desc = lrn_forward::primitive_desc(lrn2_desc, eng);
+    auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng);
+
+    net.push_back(lrn_forward(lrn2_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
+            { DNNL_ARG_DST, lrn2_dst_memory } });
+
+    // AlexNet: pool2
+    // {batch, 256, 27, 27} -> {batch, 256, 13, 13}
+    // kernel: {3, 3}
+    // strides: {2, 2}
+    memory::dims pool2_dst_tz = { batch, 256, 13, 13 };
+    memory::dims pool2_kernel = { 3, 3 };
+    memory::dims pool2_strides = { 2, 2 };
+    memory::dims pool2_padding = { 0, 0 };
+
+    auto pool2_dst_md = memory::desc({ pool2_dst_tz }, dt::f32, tag::any);
+
+    // create a pooling
+    auto pool2_desc = pooling_forward::desc(prop_kind::forward_inference,
+            algorithm::pooling_max, lrn2_dst_memory.get_desc(), pool2_dst_md,
+            pool2_strides, pool2_kernel, pool2_padding, pool2_padding);
+    auto pool2_pd = pooling_forward::primitive_desc(pool2_desc, eng);
+    auto pool2_dst_memory = memory(pool2_pd.dst_desc(), eng);
+
+    // create pooling primitive an add it to net
+    net.push_back(pooling_forward(pool2_pd));
+    net_args.push_back({ { DNNL_ARG_SRC, lrn2_dst_memory },
+            { DNNL_ARG_DST, pool2_dst_memory } });
+
+    // AlexNet: conv3
+    // {batch, 256, 13, 13} (x)  {384, 256, 3, 3}; -> {batch, 384, 13, 13};
+    // strides: {1, 1}
+    memory::dims conv3_src_tz = { batch, 256, 13, 13 };
+    memory::dims conv3_weights_tz = { 384, 256, 3, 3 };
+    memory::dims conv3_bias_tz = { 384 };
+    memory::dims conv3_dst_tz = { batch, 384, 13, 13 };
+    memory::dims conv3_strides = { 1, 1 };
+    memory::dims conv3_padding = { 1, 1 };
+
+    std::vector<float> conv3_weights(product(conv3_weights_tz));
+    std::vector<float> conv3_bias(product(conv3_bias_tz));
+
+    // create memory for user data
+    auto conv3_user_weights_memory
+            = memory({ { conv3_weights_tz }, dt::f32, tag::oihw }, eng,
+                    conv3_weights.data());
+    auto conv3_user_bias_memory = memory(
+            { { conv3_bias_tz }, dt::f32, tag::x }, eng, conv3_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto conv3_src_md = memory::desc({ conv3_src_tz }, dt::f32, tag::any);
+    auto conv3_bias_md = memory::desc({ conv3_bias_tz }, dt::f32, tag::any);
+    auto conv3_weights_md
+            = memory::desc({ conv3_weights_tz }, dt::f32, tag::any);
+    auto conv3_dst_md = memory::desc({ conv3_dst_tz }, dt::f32, tag::any);
+
+    // create a convolution
+    auto conv3_desc = convolution_forward::desc(prop_kind::forward_inference,
+            algorithm::convolution_direct, conv3_src_md, conv3_weights_md, conv3_bias_md,
+            conv3_dst_md, conv3_strides, conv3_padding, conv3_padding);
+    auto conv3_prim_desc = convolution_forward::primitive_desc(conv3_desc, eng);
+
+    auto conv3_src_memory = pool2_dst_memory;
+    if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) {
+        conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng);
+        net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, pool2_dst_memory },
+                { DNNL_ARG_TO, conv3_src_memory } });
+    }
+
+    auto conv3_weights_memory = conv3_user_weights_memory;
+    if (conv3_prim_desc.weights_desc()
+            != conv3_user_weights_memory.get_desc()) {
+        conv3_weights_memory = memory(conv3_prim_desc.weights_desc(), eng);
+        reorder(conv3_user_weights_memory, conv3_weights_memory)
+                .execute(s, conv3_user_weights_memory, conv3_weights_memory);
+    }
+
+    auto conv3_dst_memory = memory(conv3_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(convolution_forward(conv3_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv3_src_memory },
+            { DNNL_ARG_WEIGHTS, conv3_weights_memory },
+            { DNNL_ARG_BIAS, conv3_user_bias_memory },
+            { DNNL_ARG_DST, conv3_dst_memory } });
+
+    // AlexNet: relu3
+    // {batch, 384, 13, 13} -> {batch, 384, 13, 13}
+    const float negative3_slope = 1.0f;
+
+    // create relu primitive and add it to net
+    auto relu3_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv3_dst_memory.get_desc(),
+            negative3_slope);
+    auto relu3_prim_desc = eltwise_forward::primitive_desc(relu3_desc, eng);
+
+    net.push_back(eltwise_forward(relu3_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv3_dst_memory },
+            { DNNL_ARG_DST, conv3_dst_memory } });
+
+    // AlexNet: conv4
+    // {batch, 384, 13, 13} (x)  {2, 192, 192, 3, 3}; ->
+    // {batch, 384, 13, 13};
+    // strides: {1, 1}
+    memory::dims conv4_src_tz = { batch, 384, 13, 13 };
+    memory::dims conv4_weights_tz = { 2, 192, 192, 3, 3 };
+    memory::dims conv4_bias_tz = { 384 };
+    memory::dims conv4_dst_tz = { batch, 384, 13, 13 };
+    memory::dims conv4_strides = { 1, 1 };
+    memory::dims conv4_padding = { 1, 1 };
+
+    std::vector<float> conv4_weights(product(conv4_weights_tz));
+    std::vector<float> conv4_bias(product(conv4_bias_tz));
+
+    // create memory for user data
+    auto conv4_user_weights_memory
+            = memory({ { conv4_weights_tz }, dt::f32, tag::goihw }, eng,
+                    conv4_weights.data());
+    auto conv4_user_bias_memory = memory(
+            { { conv4_bias_tz }, dt::f32, tag::x }, eng, conv4_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto conv4_src_md = memory::desc({ conv4_src_tz }, dt::f32, tag::any);
+    auto conv4_bias_md = memory::desc({ conv4_bias_tz }, dt::f32, tag::any);
+    auto conv4_weights_md
+            = memory::desc({ conv4_weights_tz }, dt::f32, tag::any);
+    auto conv4_dst_md = memory::desc({ conv4_dst_tz }, dt::f32, tag::any);
+
+    // create a convolution
+    auto conv4_desc = convolution_forward::desc(prop_kind::forward_inference,
+            algorithm::convolution_direct, conv4_src_md, conv4_weights_md, conv4_bias_md,
+            conv4_dst_md, conv4_strides, conv4_padding, conv4_padding);
+    auto conv4_prim_desc = convolution_forward::primitive_desc(conv4_desc, eng);
+
+    auto conv4_src_memory = conv3_dst_memory;
+    if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) {
+        conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng);
+        net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, conv3_dst_memory },
+                { DNNL_ARG_TO, conv4_src_memory } });
+    }
+
+    auto conv4_weights_memory = conv4_user_weights_memory;
+    if (conv4_prim_desc.weights_desc()
+            != conv4_user_weights_memory.get_desc()) {
+        conv4_weights_memory = memory(conv4_prim_desc.weights_desc(), eng);
+        reorder(conv4_user_weights_memory, conv4_weights_memory)
+                .execute(s, conv4_user_weights_memory, conv4_weights_memory);
+    }
+
+    auto conv4_dst_memory = memory(conv4_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(convolution_forward(conv4_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv4_src_memory },
+            { DNNL_ARG_WEIGHTS, conv4_weights_memory },
+            { DNNL_ARG_BIAS, conv4_user_bias_memory },
+            { DNNL_ARG_DST, conv4_dst_memory } });
+
+    // AlexNet: relu4
+    // {batch, 384, 13, 13} -> {batch, 384, 13, 13}
+    const float negative4_slope = 1.0f;
+
+    // create relu primitive and add it to net
+    auto relu4_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv4_dst_memory.get_desc(),
+            negative4_slope);
+    auto relu4_prim_desc = eltwise_forward::primitive_desc(relu4_desc, eng);
+
+    net.push_back(eltwise_forward(relu4_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv4_dst_memory },
+            { DNNL_ARG_DST, conv4_dst_memory } });
+
+    // AlexNet: conv5
+    // {batch, 384, 13, 13} (x)  {2, 128, 192, 3, 3}; -> {batch, 256, 13, 13};
+    // strides: {1, 1}
+    memory::dims conv5_src_tz = { batch, 384, 13, 13 };
+    memory::dims conv5_weights_tz = { 2, 128, 192, 3, 3 };
+    memory::dims conv5_bias_tz = { 256 };
+    memory::dims conv5_dst_tz = { batch, 256, 13, 13 };
+    memory::dims conv5_strides = { 1, 1 };
+    memory::dims conv5_padding = { 1, 1 };
+
+    std::vector<float> conv5_weights(product(conv5_weights_tz));
+    std::vector<float> conv5_bias(product(conv5_bias_tz));
+
+    // create memory for user data
+    auto conv5_user_weights_memory
+            = memory({ { conv5_weights_tz }, dt::f32, tag::goihw }, eng,
+                    conv5_weights.data());
+    auto conv5_user_bias_memory = memory(
+            { { conv5_bias_tz }, dt::f32, tag::x }, eng, conv5_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto conv5_src_md = memory::desc({ conv5_src_tz }, dt::f32, tag::any);
+    auto conv5_weights_md
+            = memory::desc({ conv5_weights_tz }, dt::f32, tag::any);
+    auto conv5_bias_md = memory::desc({ conv5_bias_tz }, dt::f32, tag::any);
+    auto conv5_dst_md = memory::desc({ conv5_dst_tz }, dt::f32, tag::any);
+
+    // create a convolution
+    auto conv5_desc = convolution_forward::desc(prop_kind::forward_inference,
+            algorithm::convolution_direct, conv5_src_md, conv5_weights_md, conv5_bias_md,
+            conv5_dst_md, conv5_strides, conv5_padding, conv5_padding);
+    auto conv5_prim_desc = convolution_forward::primitive_desc(conv5_desc, eng);
+
+    auto conv5_src_memory = conv4_dst_memory;
+    if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) {
+        conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng);
+        net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, conv4_dst_memory },
+                { DNNL_ARG_TO, conv5_src_memory } });
+    }
+
+    auto conv5_weights_memory = conv5_user_weights_memory;
+    if (conv5_prim_desc.weights_desc()
+            != conv5_user_weights_memory.get_desc()) {
+        conv5_weights_memory = memory(conv5_prim_desc.weights_desc(), eng);
+        reorder(conv5_user_weights_memory, conv5_weights_memory)
+                .execute(s, conv5_user_weights_memory, conv5_weights_memory);
+    }
+
+    auto conv5_dst_memory = memory(conv5_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(convolution_forward(conv5_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv5_src_memory },
+            { DNNL_ARG_WEIGHTS, conv5_weights_memory },
+            { DNNL_ARG_BIAS, conv5_user_bias_memory },
+            { DNNL_ARG_DST, conv5_dst_memory } });
+
+    // AlexNet: relu5
+    // {batch, 256, 13, 13} -> {batch, 256, 13, 13}
+    const float negative5_slope = 1.0f;
+
+    // create relu primitive and add it to net
+    auto relu5_desc = eltwise_forward::desc(prop_kind::forward_inference,
+            algorithm::eltwise_relu, conv5_dst_memory.get_desc(),
+            negative5_slope);
+    auto relu5_prim_desc = eltwise_forward::primitive_desc(relu5_desc, eng);
+
+    net.push_back(eltwise_forward(relu5_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
+            { DNNL_ARG_DST, conv5_dst_memory } });
+
+    // AlexNet: pool5
+    // {batch, 256, 13, 13} -> {batch, 256, 6, 6}
+    // kernel: {3, 3}
+    // strides: {2, 2}
+    memory::dims pool5_dst_tz = { batch, 256, 6, 6 };
+    memory::dims pool5_kernel = { 3, 3 };
+    memory::dims pool5_strides = { 2, 2 };
+    memory::dims pool5_padding = { 0, 0 };
+
+    std::vector<float> pool5_dst(product(pool5_dst_tz));
+
+    auto pool5_dst_md = memory::desc({ pool5_dst_tz }, dt::f32, tag::any);
+
+    // create a pooling
+    auto pool5_desc = pooling_forward::desc(prop_kind::forward_inference,
+            algorithm::pooling_max, conv5_dst_memory.get_desc(), pool5_dst_md,
+            pool5_strides, pool5_kernel, pool5_padding, pool5_padding);
+    auto pool5_pd = pooling_forward::primitive_desc(pool5_desc, eng);
+
+    auto pool5_dst_memory = memory(pool5_pd.dst_desc(), eng);
+
+    // create pooling primitive an add it to net
+    net.push_back(pooling_forward(pool5_pd));
+    net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
+            { DNNL_ARG_DST, pool5_dst_memory } });
+
+
+    // fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}-> {batch,
+    // 4096}
+    memory::dims fc6_src_tz = { batch, 256, 6, 6 };
+    memory::dims fc6_weights_tz = { 4096, 256, 6, 6 };
+    memory::dims fc6_bias_tz = { 4096 };
+    memory::dims fc6_dst_tz = { batch, 4096 };
+
+    std::vector<float> fc6_weights(product(fc6_weights_tz));
+    std::vector<float> fc6_bias(product(fc6_bias_tz));
+
+    // create memory for user data
+    auto fc6_user_weights_memory
+            = memory({ { fc6_weights_tz }, dt::f32, tag::oihw }, eng,
+                    fc6_weights.data());
+    auto fc6_user_bias_memory = memory(
+            { { fc6_bias_tz }, dt::f32, tag::x }, eng, fc6_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto fc6_src_md = memory::desc({ fc6_src_tz }, dt::f32, tag::any);
+    auto fc6_bias_md = memory::desc({ fc6_bias_tz }, dt::f32, tag::any);
+    auto fc6_weights_md = memory::desc({ fc6_weights_tz }, dt::f32, tag::any);
+    auto fc6_dst_md = memory::desc({ fc6_dst_tz }, dt::f32, tag::any);
+
+    // create a inner_product
+    auto fc6_desc = inner_product_forward::desc(prop_kind::forward_inference,
+            fc6_src_md, fc6_weights_md, fc6_bias_md, fc6_dst_md);
+    auto fc6_prim_desc = inner_product_forward::primitive_desc(fc6_desc, eng);
+
+    auto fc6_src_memory = pool5_dst_memory;
+    if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) {
+        fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng);
+        net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, pool5_dst_memory },
+                { DNNL_ARG_TO, fc6_src_memory } });
+    }
+
+    auto fc6_weights_memory = fc6_user_weights_memory;
+    if (fc6_prim_desc.weights_desc() != fc6_user_weights_memory.get_desc()) {
+        fc6_weights_memory = memory(fc6_prim_desc.weights_desc(), eng);
+        reorder(fc6_user_weights_memory, fc6_weights_memory)
+                .execute(s, fc6_user_weights_memory, fc6_weights_memory);
+    }
+
+    auto fc6_dst_memory = memory(fc6_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(inner_product_forward(fc6_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, fc6_src_memory },
+            { DNNL_ARG_WEIGHTS, fc6_weights_memory },
+            { DNNL_ARG_BIAS, fc6_user_bias_memory },
+            { DNNL_ARG_DST, fc6_dst_memory } });
+
+
+    // fc7 inner product {batch, 4096} (x) {4096, 4096}-> {batch, 4096}
+    memory::dims fc7_weights_tz = { 4096, 4096 };
+    memory::dims fc7_bias_tz = { 4096 };
+    memory::dims fc7_dst_tz = { batch, 4096 };
+
+    std::vector<float> fc7_weights(product(fc7_weights_tz));
+    std::vector<float> fc7_bias(product(fc7_bias_tz));
+
+    // create memory for user data
+    auto fc7_user_weights_memory = memory(
+            { { fc7_weights_tz }, dt::f32, tag::nc }, eng, fc7_weights.data());
+
+    auto fc7_user_bias_memory = memory(
+            { { fc7_bias_tz }, dt::f32, tag::x }, eng, fc7_bias.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto fc7_bias_md = memory::desc({ fc7_bias_tz }, dt::f32, tag::any);
+    auto fc7_weights_md = memory::desc({ fc7_weights_tz }, dt::f32, tag::any);
+    auto fc7_dst_md = memory::desc({ fc7_dst_tz }, dt::f32, tag::any);
+
+    // create a inner_product
+    auto fc7_desc = inner_product_forward::desc(prop_kind::forward_inference,
+            fc6_dst_memory.get_desc(), fc7_weights_md, fc7_bias_md, fc7_dst_md);
+    auto fc7_prim_desc = inner_product_forward::primitive_desc(fc7_desc, eng);
+
+    auto fc7_weights_memory = fc7_user_weights_memory;
+    if (fc7_prim_desc.weights_desc() != fc7_user_weights_memory.get_desc()) {
+        fc7_weights_memory = memory(fc7_prim_desc.weights_desc(), eng);
+        reorder(fc7_user_weights_memory, fc7_weights_memory)
+                .execute(s, fc7_user_weights_memory, fc7_weights_memory);
+    }
+
+    auto fc7_dst_memory = memory(fc7_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(inner_product_forward(fc7_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, fc6_dst_memory },
+            { DNNL_ARG_WEIGHTS, fc7_weights_memory },
+            { DNNL_ARG_BIAS, fc7_user_bias_memory },
+            { DNNL_ARG_DST, fc7_dst_memory } });
+
+    // fc8 inner product {batch, 4096} (x) {1000, 4096}-> {batch, 1000}
+    memory::dims fc8_weights_tz = { 1000, 4096 };
+    memory::dims fc8_bias_tz = { 1000 };
+    memory::dims fc8_dst_tz = { batch, 1000 };
+
+    std::vector<float> fc8_weights(product(fc8_weights_tz));
+    std::vector<float> fc8_bias(product(fc8_bias_tz));
+
+    // create memory for user data
+    auto fc8_user_weights_memory = memory(
+            { { fc8_weights_tz }, dt::f32, tag::nc }, eng, fc8_weights.data());
+    auto fc8_user_bias_memory = memory(
+            { { fc8_bias_tz }, dt::f32, tag::x }, eng, fc8_bias.data());
+    auto user_dst_memory = memory(
+            { { fc8_dst_tz }, dt::f32, tag::nc }, eng, user_dst.data());
+
+    // create memory descriptors for convolution data w/ no specified format
+    auto fc8_bias_md = memory::desc({ fc8_bias_tz }, dt::f32, tag::any);
+    auto fc8_weights_md = memory::desc({ fc8_weights_tz }, dt::f32, tag::any);
+    auto fc8_dst_md = memory::desc({ fc8_dst_tz }, dt::f32, tag::any);
+
+    // create a inner_product
+    auto fc8_desc = inner_product_forward::desc(prop_kind::forward_inference,
+            fc7_dst_memory.get_desc(), fc8_weights_md, fc8_bias_md, fc8_dst_md);
+    auto fc8_prim_desc = inner_product_forward::primitive_desc(fc8_desc, eng);
+
+    auto fc8_weights_memory = fc8_user_weights_memory;
+    if (fc8_prim_desc.weights_desc() != fc8_user_weights_memory.get_desc()) {
+        fc8_weights_memory = memory(fc8_prim_desc.weights_desc(), eng);
+        reorder(fc8_user_weights_memory, fc8_weights_memory)
+                .execute(s, fc8_user_weights_memory, fc8_weights_memory);
+    }
+
+    auto fc8_dst_memory = memory(fc8_prim_desc.dst_desc(), eng);
+
+    // create convolution primitive and add it to net
+    net.push_back(inner_product_forward(fc8_prim_desc));
+    net_args.push_back({ { DNNL_ARG_SRC, fc7_dst_memory },
+            { DNNL_ARG_WEIGHTS, fc8_weights_memory },
+            { DNNL_ARG_BIAS, fc8_user_bias_memory },
+            { DNNL_ARG_DST, fc8_dst_memory } });
+
+    // create reorder between internal and user data if it is needed and
+    // add it to net after pooling
+    if (fc8_dst_memory != user_dst_memory) {
+        net.push_back(reorder(fc8_dst_memory, user_dst_memory));
+        net_args.push_back({ { DNNL_ARG_FROM, fc8_dst_memory },
+                { DNNL_ARG_TO, user_dst_memory } });
+    }
+
+/// @page cpu_cnn_inference_f32_cpp
+/// Finally, execute the primitives. For this example, the net is executed
+/// multiple times and each execution is timed individually.
+/// @snippet cpu_cnn_inference_f32.cpp Execute model
+//[Execute model]
+    for (int j = 0; j < times; ++j) {
+        assert(net.size() == net_args.size() && "something is missing");
+        for (size_t i = 0; i < net.size(); ++i)
+            net.at(i).execute(s, net_args.at(i));
+    }
+//[Execute model]
+
+    s.wait();
+}
+
+// Runs simple_net(times) once and prints its total wall time divided by
+// times, in milliseconds; exits with 1 when a oneDNN error is caught.
+int main(int argc, char **argv) {
+    try {
+        const int times = 100;
+        const auto begin = chrono::steady_clock::now();
+        simple_net(times);
+        const auto end = chrono::steady_clock::now();
+        auto ms = chrono::duration_cast<chrono::milliseconds>(end - begin);
+        cout << "Use time " << ms.count() / (times + 0.0) << "\n";
+    } catch (error &e) {
+        std::cerr << "status: " << e.status << std::endl;
+        std::cerr << "message: " << e.message << std::endl;
+        return 1; // a oneDNN error must not be reported as success
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.patch b/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.patch
new file mode 100644
index 0000000000..c76bb2a859
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/codes_for_ipynb/cnn_inference_f32.patch
@@ -0,0 +1,245 @@
+--- cnn_inference_f32.cpp	2020-02-12 10:12:10.467690007 -0800
++++ cnn_inference_f32_gpu.cpp	2020-02-12 10:12:28.395690295 -0800
+@@ -19,12 +19,62 @@
+             std::multiplies<memory::dim>());
+ }
+ 
++
++// ------ GPU code conversion --Step 2 >>>>>>
++// Read from handle, write to memory
++// Copies mem.get_desc().get_size() bytes from the host pointer `handle`
++// into `mem`, through a SYCL write accessor or the raw CPU data handle.
++inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
++    const dnnl::engine::kind engine_kind = mem.get_engine().get_kind();
++    const size_t byte_count = mem.get_desc().get_size();
++    const uint8_t *src_bytes = static_cast<const uint8_t *>(handle);
++    const bool sycl_on_cpu = engine_kind == dnnl::engine::kind::cpu
++            && DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL;
++    const bool sycl_on_gpu = engine_kind == dnnl::engine::kind::gpu
++            && DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL;
++
++    if (sycl_on_cpu || sycl_on_gpu) {
++        // dst_bytes is obtained from `writer`; keep it in scope while copying.
++        auto sycl_buf = mem.get_sycl_buffer<uint8_t>();
++        auto writer = sycl_buf.get_access<cl::sycl::access::mode::write>();
++        uint8_t *dst_bytes = writer.get_pointer();
++        if (handle == nullptr || dst_bytes == nullptr) {
++            std::cerr << "memory is NULL" << "\n";
++            return;
++        }
++        for (size_t k = 0; k != byte_count; ++k)
++            dst_bytes[k] = src_bytes[k];
++        return;
++    }
++
++    if (engine_kind != dnnl::engine::kind::cpu) {
++        assert(!"not expected");
++        return;
++    }
++
++    // CPU engine without the SYCL runtime: write through the data handle.
++    auto *raw_dst = static_cast<uint8_t *>(mem.get_data_handle());
++    if (handle == nullptr || raw_dst == nullptr) {
++        std::cerr << "memory is NULL" << "\n";
++        return;
++    }
++    for (size_t k = 0; k != byte_count; ++k)
++        raw_dst[k] = src_bytes[k];
++}
++//<<<<<< ------ GPU code conversion --Step 2
++
++
++
+ void simple_net(int times = 100) {
+     using tag = memory::format_tag;
+     using dt = memory::data_type;
+ 
+ 
+-    engine eng(engine::kind::cpu, 0);
++// ------ GPU code conversion --Step 1 >>>>>>
++    engine eng(engine::kind::gpu, 0);
++//<<<<<< ------ GPU code conversion --Step 1
++
++    
+     stream s(eng);
+ 
+     std::vector<primitive> net;
+@@ -53,13 +103,17 @@
+ //[Allocate buffers]
+ 
+ 
++// ------ GPU code conversion --Step 3 >>>>>>
+     auto user_src_memory = memory(
+-            { { conv1_src_tz }, dt::f32, tag::nchw }, eng, user_src.data());
++            { { conv1_src_tz }, dt::f32, tag::nchw }, eng);
++    write_to_dnnl_memory(user_src.data(), user_src_memory);
+     auto user_weights_memory
+-            = memory({ { conv1_weights_tz }, dt::f32, tag::oihw }, eng,
+-                    conv1_weights.data());
++            = memory({ { conv1_weights_tz }, dt::f32, tag::oihw }, eng);
++    write_to_dnnl_memory(conv1_weights.data(), user_weights_memory);
+     auto conv1_user_bias_memory = memory(
+-            { { conv1_bias_tz }, dt::f32, tag::x }, eng, conv1_bias.data());
++            { { conv1_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(conv1_bias.data(), conv1_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
+ 
+ 
+@@ -175,13 +229,18 @@
+     std::vector<float> conv2_weights(product(conv2_weights_tz));
+     std::vector<float> conv2_bias(product(conv2_bias_tz));
+ 
++
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto conv2_user_weights_memory
+-            = memory({ { conv2_weights_tz }, dt::f32, tag::goihw }, eng,
+-                    conv2_weights.data());
++            = memory({ { conv2_weights_tz }, dt::f32, tag::goihw }, eng);
++    write_to_dnnl_memory(conv2_weights.data(), conv2_user_weights_memory);
+     auto conv2_user_bias_memory = memory(
+-            { { conv2_bias_tz }, dt::f32, tag::x }, eng, conv2_bias.data());
++            { { conv2_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(conv2_bias.data(), conv2_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto conv2_src_md = memory::desc({ conv2_src_tz }, dt::f32, tag::any);
+     auto conv2_bias_md = memory::desc({ conv2_bias_tz }, dt::f32, tag::any);
+@@ -291,13 +350,18 @@
+     std::vector<float> conv3_weights(product(conv3_weights_tz));
+     std::vector<float> conv3_bias(product(conv3_bias_tz));
+ 
++
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto conv3_user_weights_memory
+-            = memory({ { conv3_weights_tz }, dt::f32, tag::oihw }, eng,
+-                    conv3_weights.data());
++            = memory({ { conv3_weights_tz }, dt::f32, tag::oihw }, eng);
++    write_to_dnnl_memory(conv3_weights.data(), conv3_user_weights_memory);
+     auto conv3_user_bias_memory = memory(
+-            { { conv3_bias_tz }, dt::f32, tag::x }, eng, conv3_bias.data());
++            { { conv3_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(conv3_bias.data(), conv3_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto conv3_src_md = memory::desc({ conv3_src_tz }, dt::f32, tag::any);
+     auto conv3_bias_md = memory::desc({ conv3_bias_tz }, dt::f32, tag::any);
+@@ -364,13 +428,17 @@
+     std::vector<float> conv4_weights(product(conv4_weights_tz));
+     std::vector<float> conv4_bias(product(conv4_bias_tz));
+ 
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto conv4_user_weights_memory
+-            = memory({ { conv4_weights_tz }, dt::f32, tag::goihw }, eng,
+-                    conv4_weights.data());
++            = memory({ { conv4_weights_tz }, dt::f32, tag::goihw }, eng);
++    write_to_dnnl_memory(conv4_weights.data(), conv4_user_weights_memory);
+     auto conv4_user_bias_memory = memory(
+-            { { conv4_bias_tz }, dt::f32, tag::x }, eng, conv4_bias.data());
++            { { conv4_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(conv4_bias.data(), conv4_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto conv4_src_md = memory::desc({ conv4_src_tz }, dt::f32, tag::any);
+     auto conv4_bias_md = memory::desc({ conv4_bias_tz }, dt::f32, tag::any);
+@@ -436,13 +504,18 @@
+     std::vector<float> conv5_weights(product(conv5_weights_tz));
+     std::vector<float> conv5_bias(product(conv5_bias_tz));
+ 
++
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto conv5_user_weights_memory
+-            = memory({ { conv5_weights_tz }, dt::f32, tag::goihw }, eng,
+-                    conv5_weights.data());
++            = memory({ { conv5_weights_tz }, dt::f32, tag::goihw }, eng);
++    write_to_dnnl_memory(conv5_weights.data(), conv5_user_weights_memory);
+     auto conv5_user_bias_memory = memory(
+-            { { conv5_bias_tz }, dt::f32, tag::x }, eng, conv5_bias.data());
++            { { conv5_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(conv5_bias.data(), conv5_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto conv5_src_md = memory::desc({ conv5_src_tz }, dt::f32, tag::any);
+     auto conv5_weights_md
+@@ -532,13 +605,18 @@
+     std::vector<float> fc6_weights(product(fc6_weights_tz));
+     std::vector<float> fc6_bias(product(fc6_bias_tz));
+ 
++
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto fc6_user_weights_memory
+-            = memory({ { fc6_weights_tz }, dt::f32, tag::oihw }, eng,
+-                    fc6_weights.data());
++            = memory({ { fc6_weights_tz }, dt::f32, tag::oihw }, eng);
++    write_to_dnnl_memory(fc6_weights.data(), fc6_user_weights_memory);
+     auto fc6_user_bias_memory = memory(
+-            { { fc6_bias_tz }, dt::f32, tag::x }, eng, fc6_bias.data());
++            { { fc6_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(fc6_bias.data(), fc6_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
+ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto fc6_src_md = memory::desc({ fc6_src_tz }, dt::f32, tag::any);
+     auto fc6_bias_md = memory::desc({ fc6_bias_tz }, dt::f32, tag::any);
+@@ -583,13 +661,18 @@
+     std::vector<float> fc7_weights(product(fc7_weights_tz));
+     std::vector<float> fc7_bias(product(fc7_bias_tz));
+ 
++
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto fc7_user_weights_memory = memory(
+-            { { fc7_weights_tz }, dt::f32, tag::nc }, eng, fc7_weights.data());
+-
++            { { fc7_weights_tz }, dt::f32, tag::nc }, eng);
++    write_to_dnnl_memory(fc7_weights.data(), fc7_user_weights_memory);
+     auto fc7_user_bias_memory = memory(
+-            { { fc7_bias_tz }, dt::f32, tag::x }, eng, fc7_bias.data());
+-
++            { { fc7_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(fc7_bias.data(), fc7_user_bias_memory);
++//<<<<<< ------ GPU code conversion --Step 3
++ 
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto fc7_bias_md = memory::desc({ fc7_bias_tz }, dt::f32, tag::any);
+     auto fc7_weights_md = memory::desc({ fc7_weights_tz }, dt::f32, tag::any);
+@@ -624,14 +707,20 @@
+     std::vector<float> fc8_weights(product(fc8_weights_tz));
+     std::vector<float> fc8_bias(product(fc8_bias_tz));
+ 
++// ------ GPU code conversion --Step 3 >>>>>>
+     // create memory for user data
+     auto fc8_user_weights_memory = memory(
+-            { { fc8_weights_tz }, dt::f32, tag::nc }, eng, fc8_weights.data());
++            { { fc8_weights_tz }, dt::f32, tag::nc }, eng);
++    write_to_dnnl_memory(fc8_weights.data(), fc8_user_weights_memory);
+     auto fc8_user_bias_memory = memory(
+-            { { fc8_bias_tz }, dt::f32, tag::x }, eng, fc8_bias.data());
++            { { fc8_bias_tz }, dt::f32, tag::x }, eng);
++    write_to_dnnl_memory(fc8_bias.data(), fc8_user_bias_memory);
+     auto user_dst_memory = memory(
+-            { { fc8_dst_tz }, dt::f32, tag::nc }, eng, user_dst.data());
+-
++            { { fc8_dst_tz }, dt::f32, tag::nc }, eng);
++    write_to_dnnl_memory(user_dst.data(), user_dst_memory);
++//<<<<<< ------ GPU code conversion --Step 3
++    
++    
+     // create memory descriptors for convolution data w/ no specified format
+     auto fc8_bias_md = memory::desc({ fc8_bias_tz }, dt::f32, tag::any);
+     auto fc8_weights_md = memory::desc({ fc8_weights_tz }, dt::f32, tag::any);
diff --git a/Libraries/oneDNN/tutorials/getting_started.ipynb b/Libraries/oneDNN/tutorials/getting_started.ipynb
new file mode 100644
index 0000000000..f87476aae2
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/getting_started.ipynb
@@ -0,0 +1,561 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1.1 - Introduction to Intel® oneAPI Deep Neural Network Library (oneDNN)  - Getting Started\n",
+    "\n",
+    "## Learning Objectives\n",
+    "In this module the developer will:\n",
+    "* Learn different oneDNN configurations inside the Intel® oneAPI toolkit\n",
+    "* Learn how to compile a oneDNN sample with different configurations via batch jobs on the Intel® DevCloud for oneAPI or in local environments.\n",
+    "* Learn how to program oneDNN with a simple sample\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Getting Started Sample Exercise\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction to oneDNN configurations inside the Intel oneAPI toolkits\n",
+    "oneDNN has four different configurations inside the Intel oneAPI toolkits. Each configuration is in a different folder under the oneDNN installation path, and each configuration supports a different compiler or threading library.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the installation path of your oneAPI toolkit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ONEAPI_INSTALL=/opt/intel/oneapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!printf '%s\\n'     $ONEAPI_INSTALL/oneDNN/latest/cpu_*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see, there are four different folders under the oneDNN installation path, and each of those configurations supports different features. This tutorial will show you how to compile and run against different oneDNN configurations."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, create a lab folder for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir lab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Preparing the getting_started.cpp code\n",
+    "\n",
+    "This exercise uses the getting_started.cpp example from the oneDNN installation path.\n",
+    "\n",
+    "First, the section below copies the getting_started.cpp file into the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp $ONEAPI_INSTALL/oneDNN/latest/cpu_dpcpp_gpu_dpcpp/examples/getting_started.cpp lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Users can browse the source code by running the section below, which also removes comments for readability."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cpp -fpreprocessed  -dD -E lab/getting_started.cpp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, copy the required header files and CMake file into the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp $ONEAPI_INSTALL/oneDNN/latest/cpu_dpcpp_gpu_dpcpp/examples/example_utils.hpp lab/\n",
+    "!cp $ONEAPI_INSTALL/oneDNN/latest/cpu_dpcpp_gpu_dpcpp/examples/example_utils.h lab/\n",
+    "!cp $ONEAPI_INSTALL/oneDNN/latest/cpu_dpcpp_gpu_dpcpp/examples/CMakeLists.txt lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Build and Run with oneAPI DPC++ Compiler \n",
+    "One of the oneDNN configurations supports the oneAPI DPC++ compiler, and it can run on different architectures by using DPC++.\n",
+    "The following section shows you how to build with DPC++ and run on different architectures."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler **dpcpp** command and flags that will generate the executable.\n",
+    "In order to use DPC++ compiler and related SYCL runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are related cmake arguments for DPC++ configuration : \n",
+    "\n",
+    "   -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DDNNL_CPU_RUNTIME=SYCL -DDNNL_GPU_RUNTIME=SYCL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir dpcpp\n",
+    "cd dpcpp\n",
+    "cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DDNNL_CPU_RUNTIME=SYCL -DDNNL_GPU_RUNTIME=SYCL\n",
+    "make getting-started-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or a local machine.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "By default, the built program uses CPU as the execution engine, but the user can switch to GPU by giving an input argument \"gpu\".\n",
+    "The run.sh below passes the input argument \"cpu\" to run on CPU.\n",
+    "To run on GPU, simply replace the input argument \"cpu\" with \"gpu\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./dpcpp/out/getting-started-cpp cpu\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! rm -rf dpcpp;chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Build and Run with GNU Compiler and OpenMP \n",
+    "One of the oneDNN configurations supports GNU compilers, but it can run only on CPU.\n",
+    "The following section shows you how to build with G++ and run on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler command and flags that will generate the executable.\n",
+    "The user must switch to the G++ oneDNN configuration by inputting a custom configuration \"--dnnl-configuration=cpu_gomp\" when running \"source setvars.sh\".\n",
+    "In order to use the G++ compiler and related OMP runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are related cmake arguments for the G++ and OpenMP configuration : \n",
+    "\n",
+    "  -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_gomp\n",
+    "cd cpu_gomp\n",
+    "cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE\n",
+    "make getting-started-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user must switch to the G++ oneDNN configuration by inputting a custom configuration \"--dnnl-configuration=cpu_gomp\" when running \"source setvars.sh\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp --force> /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./cpu_gomp/out/getting-started-cpp\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! rm -rf cpu_gomp;chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Build and Run with Intel Compiler and OpenMP\n",
+    "One of the oneDNN configurations supports Intel compilers, but it can run only on CPU.\n",
+    "The following section shows you how to build with ICC and run on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE : This section is optional and it is for developers who want to use Intel Compiler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler command and flags that will generate the executable.\n",
+    "The user must switch to the ICC oneDNN configuration by inputting a custom configuration \"--dnnl-configuration=cpu_iomp\" when running \"source setvars.sh\".\n",
+    "In order to use ICC compiler and related OMP runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are related cmake arguments for the ICC and OpenMP configuration : \n",
+    "\n",
+    "  -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_iomp --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_iomp\n",
+    "cd cpu_iomp\n",
+    "cmake .. -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE\n",
+    "make getting-started-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user must switch to the ICC oneDNN configuration by inputting a custom configuration \"--dnnl-configuration=cpu_iomp\" when running \"source setvars.sh\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_iomp --force> /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./cpu_iomp/out/getting-started-cpp\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "# Build and Run with GNU Compiler and oneTBB \n",
+    "One of the oneDNN configurations supports Intel® oneAPI Threading Building Blocks (oneTBB) as its threading library, but it can run only on CPU.\n",
+    "The following section shows you how to build with oneTBB and run on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE : This section is optional and it is for developers who want to use Intel oneTBB"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler command and flags that will generate the executable.\n",
+    "The user must switch to the oneDNN with oneTBB threading configuration by inputting a custom configuration \"--dnnl-configuration=cpu_tbb\" when running \"source setvars.sh\".\n",
+    "In order to use the G++ compiler and related TBB runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are related cmake arguments for the G++ and oneTBB configuration : \n",
+    "\n",
+    "  -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=TBB -DDNNL_GPU_RUNTIME=NONE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_tbb --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_tbb\n",
+    "cd cpu_tbb\n",
+    "cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=TBB -DDNNL_GPU_RUNTIME=NONE\n",
+    "make getting-started-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or a local machine.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user must switch to the oneDNN with oneTBB threading configuration by inputting a custom configuration \"--dnnl-configuration=cpu_tbb\" when running \"source setvars.sh\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_tbb --force> /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "./cpu_tbb/out/getting-started-cpp\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Summary\n",
+    "In this lab the developer learned the following:\n",
+    "* What are the different oneDNN configurations inside the Intel oneAPI toolkits\n",
+    "* How to compile a oneDNN sample with different configurations via batch jobs on the Intel oneAPI DevCloud or in local environments\n",
+    "* How to program oneDNN with a simple sample\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "525.6px",
+    "left": "28px",
+    "top": "137.8px",
+    "width": "301.109px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Libraries/oneDNN/tutorials/images/bf16.JPG b/Libraries/oneDNN/tutorials/images/bf16.JPG
new file mode 100644
index 0000000000..cb82831853
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/bf16.JPG differ
diff --git a/Libraries/oneDNN/tutorials/images/cpu.JPG b/Libraries/oneDNN/tutorials/images/cpu.JPG
new file mode 100644
index 0000000000..ecb4065003
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/cpu.JPG differ
diff --git a/Libraries/oneDNN/tutorials/images/cpu_jit.JPG b/Libraries/oneDNN/tutorials/images/cpu_jit.JPG
new file mode 100644
index 0000000000..1938acfe29
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/cpu_jit.JPG differ
diff --git a/Libraries/oneDNN/tutorials/images/gpu.JPG b/Libraries/oneDNN/tutorials/images/gpu.JPG
new file mode 100644
index 0000000000..47a6ea0c23
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/gpu.JPG differ
diff --git a/Libraries/oneDNN/tutorials/images/gpu_kernel.JPG b/Libraries/oneDNN/tutorials/images/gpu_kernel.JPG
new file mode 100644
index 0000000000..fd0d6a870a
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/gpu_kernel.JPG differ
diff --git a/Libraries/oneDNN/tutorials/images/vnni.JPG b/Libraries/oneDNN/tutorials/images/vnni.JPG
new file mode 100644
index 0000000000..4fee7d6be6
Binary files /dev/null and b/Libraries/oneDNN/tutorials/images/vnni.JPG differ
diff --git a/Libraries/oneDNN/tutorials/profiling/README.md b/Libraries/oneDNN/tutorials/profiling/README.md
new file mode 100644
index 0000000000..f34b0858a0
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/profiling/README.md
@@ -0,0 +1,25 @@
+# oneDNN verbose log parser
+
+
+## prerequisites 
+
+
+*  users need to get a oneDNN verbose log from their workloads first.  
+
+## how to parse logs
+
+### Raw log from frameworks like tensorflow or pytorch
+*  parse a raw log "log.txt" from workload : `$profile profile_utils.py log.txt` 
+    *  users will see output from console
+    *  users will also get some pie chart diagram PNG files like typeTime Breakdown.png
+    *  users will also get a parsed output mkldnn_log.csv which only contains onednn logs
+
+### Pure oneDNN log or parsed output 'mkldnn_log.csv'
+*  parse a onednn log "mkldnn_log.csv" : `$profile profile_utils.py mkldnn_log.csv` 
+    * users will see output from console 
+    * users will also get some pie chart diagram PNG files like typeTime Breakdown.png
+
+### Compare two pure oneDNN logs 
+*  compare two onednn log "a.csv" and "b.csv" : `$profile profile_utils.py a.csv b.csv` 
+    * users will see output from console 
+    * users will also get a bar chart diagram PNG file like "type Time Comparison.png"
\ No newline at end of file
diff --git a/Libraries/oneDNN/tutorials/profiling/__init__.py b/Libraries/oneDNN/tutorials/profiling/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/Libraries/oneDNN/tutorials/profiling/profile_utils.py b/Libraries/oneDNN/tutorials/profiling/profile_utils.py
new file mode 100755
index 0000000000..fdec405f6c
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/profiling/profile_utils.py
@@ -0,0 +1,206 @@
+#! /usr/bin/env python
+import os, sys
+import subprocess
+
+os.environ['DNNL_VERBOSE'] = '1'
+import psutil
+
+class PlatformUtils:
+    """Prints CPU and memory details of the host and keeps them on the instance."""
+    def __init__(self):
+        # Was misspelled `__init_`, so Python never ran it as the constructor.
+        self.cpufreq = ''
+        self.cpu_socket_count = ''
+        self.svmem = ''
+
+    def dump_platform_info(self):
+        """Print core counts, CPU frequencies, socket count and total memory,
+        then store the psutil results and the socket count on self."""
+        print("=" * 20, "CPU Info", "=" * 20)
+        print("Physical cores:", psutil.cpu_count(logical=False))
+        print("Total cores:", psutil.cpu_count(logical=True))
+        cpufreq = psutil.cpu_freq()
+        print("Max Frequency:", cpufreq.max)
+        print("Min Frequency:", cpufreq.min)
+        # Socket count = number of distinct "physical id" lines in /proc/cpuinfo.
+        cpu_socket_count = int(subprocess.check_output(
+            'cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', shell=True))
+        print("Socket Number:", cpu_socket_count)
+        print("=" * 20, "Memory Information", "=" * 20)
+        svmem = psutil.virtual_memory()
+        # svmem.total is divided by 1024 ** 3 and truncated to whole GB.
+        print("Total: ", int(svmem.total / (1024 ** 3)), "GB")
+        self.cpufreq = cpufreq
+        self.cpu_socket_count = cpu_socket_count
+        self.svmem = svmem
+
+
+def run_workload(outfile='mkldnn_log.csv'):
+    """Run the command line given in sys.argv[1:] through the shell.
+
+    Output lines containing both 'dnnl' and 'exec' are written to `outfile`."""
+    print('Executing:', sys.argv[1:])
+    captured = subprocess.getoutput(' '.join(sys.argv[1:]))
+    with open(outfile, 'w') as sink:
+        wanted = (row + '\n' for row in captured.split('\n')
+                  if 'dnnl' in row and 'exec' in row)
+        sink.writelines(wanted)
+
+class oneDNNLog:
+    """A oneDNN (or legacy MKL-DNN) verbose log parsed into pandas DataFrames."""
+    def __init__(self):
+        # Was misspelled `__init_`, so Python never ran it as the constructor.
+        self.filename = ''
+        self.data = None       # set by load_log(): every parsed row
+        self.exec_data = None  # set by load_log(): rows whose 'exec' column is 'exec'
+
+    def load_log(self, log):
+        """Read CSV file `log` and populate filename, data and exec_data."""
+        self.filename = log
+        data = self.load_log_dnnl(log)
+        count = data['time'].count()
+
+        # No non-null 'time' values under the dnnl_verbose column layout:
+        # re-read the file with the mkldnn_verbose layout instead.
+        if count == 0:
+            data = self.load_log_mkldnn(log)
+
+        exec_data = data[data['exec'] == 'exec']
+        self.data = data
+        self.exec_data = exec_data
+
+    def load_log_dnnl(self, log):
+        """Read `log` using the 11-column dnnl_verbose layout (sample line below)."""
+        import pandas as pd
+        # dnnl_verbose,exec,cpu,convolution,jit:avx2,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb8a:f0 bia_f32::blocked:a:f0 dst_f32::blocked:aBcd8b:f0,,alg:convolution_direct,mb1_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,1.21704
+        return pd.read_csv(log, names=[ 'dnnl_verbose','exec','arch','type', 'jit', 'pass', 'fmt', 'opt', 'alg', 'shape', 'time'])
+
+    def load_log_mkldnn(self, log):
+        """Read `log` using the 9-column mkldnn_verbose layout (sample line below)."""
+        import pandas as pd
+        #mkldnn_verbose,exec,convolution,jit:avx512_common,forward_training,fsrc:nChw16c fwei:OIhw16i16o fbia:undef fdst:nChw16c,alg:convolution_direct,mb100_ic128oc32_ih7oh7kh3sh1dh0ph1_iw7ow7kw3sw1dw0pw1,0.201904
+        print("load_log_mkldnn")
+        return pd.read_csv(log, names=[ 'mkldnn_verbose','exec','type', 'jit', 'pass', 'fmt', 'alg', 'shape', 'time'])
+
+
+class oneDNNUtils:
+    """Helpers that summarise, plot and compare oneDNN verbose logs."""
+    def __init__(self):
+        # Was misspelled `__init_`, so Python never ran it as the constructor.
+        # The figure the old body built is dropped: breakdown() and stats_comp()
+        # create their own, so it would only have been a stray empty plot.
+        self.topk=50
+        self.logx=True
+        self.figsize=(10,10)
+
+    def breakdown(self, data, Group, Type):
+        """Print a per-`Group` summary of `data` and save it as a chart.
+
+        Type "time": summed 'time' as a pie; Type "count": row counts as bars."""
+        import matplotlib.pyplot as plt
+        fig = plt.figure(figsize=(18, 15))
+        ax = fig.add_subplot(111)
+        figsize=(10,10)
+        topk=50
+        if Type == "time":
+            print()
+            print(' breakdown:',Group)
+            # Largest totals first so head(topk) keeps the top-k groups; the
+            # default ascending order kept the k smallest instead.
+            time = data.groupby(Group)['time'].sum().sort_values(ascending=False).head(topk)
+            print(time)
+            title=Group + "Time Breakdown"
+            time[:topk].plot.pie(
+                ax=ax, title=title, figsize=figsize, logx=True, autopct='%1.1f%%')
+            ax.figure.savefig(title)
+        elif Type == "count":
+            print()
+            count = data[Group].value_counts().head(topk)
+            print(count)
+            title=Group+"Count Breakdown"
+            count[:topk].plot.bar(
+                ax=ax, title=title, figsize=figsize, logx=False, rot=45)
+            ax.figure.savefig(title)
+
+    def stats_comp(self, name, Type,onednn_log1, onednn_log2, n=50):
+        """Print a run1/run2 table of `name` counts or summed times (top `n` rows).
+
+        For Type "time", also save a bar chart of the run2/run1 ratio."""
+        import pandas as pd
+        import matplotlib.pyplot as plt
+        fig = plt.figure(figsize=(18, 15))
+        ax = fig.add_subplot(111)
+        figsize=(10,10)
+        topk=50
+
+        d1 = onednn_log1.exec_data
+        log1 = onednn_log1.filename
+        d2 = onednn_log2.exec_data
+        log2 = onednn_log2.filename
+        print(name, 'stats:')
+        if Type == "count":
+            jitstat = pd.concat((d1[name].value_counts(), d2[name].value_counts()), axis=1, sort=True)
+            jitstat.columns = ('1-' + log1, '2-' + log2)
+            jitstat['run2/run1'] = jitstat.iloc[:, 1] / jitstat.iloc[:, 0]
+            jitstat_count = jitstat.sort_values('1-' + log1, ascending=False).head(n)
+            print(jitstat_count)
+        elif Type == "time":
+            jitstat = pd.concat((d1.groupby(name)['time'].sum(), d2.groupby(name)['time'].sum()), axis=1, sort=True)
+            jitstat.columns = ('1-' + log1, '2-' + log2)
+            jitstat['run2/run1'] = jitstat.iloc[:, 1] / jitstat.iloc[:, 0]
+            jitstat_time = jitstat.sort_values('1-' + log1, ascending=False).head(n)
+            print(jitstat_time)
+            title=name + " run2/run1 Time Comparison"
+            jitstat_compare = jitstat_time.drop(columns=['1-' + log1, '2-' + log2])
+            if len(jitstat_compare) == 0:
+                return
+            jitstat_compare[:topk].plot.bar(
+                ax=ax, title=title, figsize=figsize, logx=False, rot=45)
+            filename = name + " Time Comparison"
+            ax.figure.savefig(filename)
+
+    def parse_raw_output_to_csv(self, filepath, csvpath='mkldnn_log.csv', keyword='dnnl_verbose'):
+        """Copy each line of `filepath` containing `keyword` into `csvpath`; returns `csvpath`."""
+        with open(csvpath, "w") as file:
+            with open(filepath) as fp:
+                for line in fp:
+                    if keyword in line:
+                        file.write(line)
+        return csvpath
+
+
+
+if __name__ == '__main__':
+    # Command-line entry point. The arguments select one of three modes:
+    #   <a.csv> <b.csv>  compare two already-parsed logs
+    #   <a.csv>          break down one already-parsed log
+    #   <raw file>       filter its "_verbose" lines into a CSV, then break it down
+    # With no argument nothing is reported.
+    onednn = oneDNNUtils()
+    argv = sys.argv
+    two_csv_logs = len(argv) > 2 and '.csv' in argv[1] and '.csv' in argv[2]
+    if two_csv_logs:
+        log1 = oneDNNLog()
+        log1.load_log(argv[1])
+        log2 = oneDNNLog()
+        log2.load_log(argv[2])
+        print('Total time %s: %0.2f\t---  %s: %0.2f' % (log1.filename, log1.data['time'].sum(), log2.filename, log2.data['time'].sum()))
+        print('Total  ops  %s: %d\t\t---  %s: %d'    % (log1.filename, log1.data['time'].count(), log2.filename, log2.data['time'].count()))
+
+        print()
+        onednn.stats_comp('type', 'time',log1, log2)
+
+    elif len(argv) > 1:
+        if '.csv' in argv[1]:
+            # Already a CSV log: load it as-is.
+            csvpath = argv[1]
+        else:
+            # Raw workload output: keep only the verbose lines first.
+            csvpath = onednn.parse_raw_output_to_csv(argv[1], keyword="_verbose")
+            print(csvpath)
+        log = oneDNNLog()
+        log.load_log(csvpath)
+        print('Total MKLDNN time:', log.data['time'].sum())
+        print('Total MKLDNN ops:', log.data['time'].count())
+        onednn.breakdown(log.exec_data,"type","time")
+        onednn.breakdown(log.exec_data,"jit","time")
diff --git a/Libraries/oneDNN/tutorials/q b/Libraries/oneDNN/tutorials/q
new file mode 100755
index 0000000000..98c1f7759a
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/q
@@ -0,0 +1,41 @@
+#!/bin/bash
+#========================================
+# Script to submit job in Intel devcloud
+#
+# Version: 0.5
+#========================================
+# Usage: [property=gpu|clx|skx] ./q <script.sh>
+# Submits <script.sh> with qsub, polls for up to 60 seconds until a file
+# matching <script.sh>.o* exists, then prints the matching .o* and .e* files.
+if [ -z "$property" ]; then
+    property='gpu'
+fi
+
+if [ -z "$1" ]; then
+	echo "Missing script argument, Usage: ./q run.sh"
+elif [ ! -f "$1" ]; then
+    echo "File $1 does not exist"
+else
+	script=$1
+	# Remove earlier *.sh.* files (presumably old job output) before waiting for new ones.
+	rm *.sh.* > /dev/null 2>&1
+	echo "Submitting job:"
+	qsub -l nodes=1:"$property":ppn=2 -d . "$script"
+	# Show the queue state right after submission.
+	qstat 
+	# Wait for the output file to be generated, then display it.
+	echo -ne "Waiting for Output."
+	timeout=0  # seconds waited so far; set explicitly so an inherited value cannot skew the count
+	# compgen -G succeeds when the glob matches one or more files; `[ -f glob ]` errors on several matches.
+	until compgen -G "$script.o*" > /dev/null; do
+		sleep 1
+		echo -ne "."
+		((timeout++))
+		if [ "$timeout" -eq 60 ]; then
+			echo "TimeOut 60 seconds: Job is still queued for execution, check for output file later (*.sh.o)"
+			break
+		fi
+	done
+	cat "$script".o*
+	cat "$script".e*
+fi
diff --git a/Libraries/oneDNN/tutorials/requirements.txt b/Libraries/oneDNN/tutorials/requirements.txt
new file mode 100644
index 0000000000..c68f08ce51
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/requirements.txt
@@ -0,0 +1,5 @@
+###### Requirements without Version Specifiers ######
+pandas
+matplotlib
+psutil
+###### Requirements with Version Specifiers ######
diff --git a/Libraries/oneDNN/tutorials/sample.json b/Libraries/oneDNN/tutorials/sample.json
new file mode 100644
index 0000000000..00ac1bb1da
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/sample.json
@@ -0,0 +1,44 @@
+{
+ "guid": "FC7A16DE-9594-4F40-AFA2-71ACABF366B3",
+ "name": "oneDNN Tutorials",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneDNN"],
+ "description": "oneDNN Tutorials.",
+ "toolchain": ["dpcpp"],
+ "languages": [{"cpp":{}}],
+ "dependencies": ["oneDNN", "tbb"],
+ "os": ["linux"],
+ "builder": ["ide","cmake"],
+ "targetDevice": ["CPU", "GPU"],
+ "ciTests": {
+	"linux": [
+	{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "dnn gsg",
+		"steps": [
+			"runipy getting_started.ipynb"
+		 ]
+	},
+	{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "simple_model",
+		"steps": [
+			"runipy simple_model.ipynb"
+		 ]
+	},
+	{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force" ],
+		"id": "verbose_jit",
+		"steps": [
+			"runipy verbose_jitdump.ipynb"
+		 ]
+	},
+	{
+		"env": ["source /opt/intel/oneapi/setvars.sh --dnnl-configuration=cpu_gomp --force" ],
+		"id": "isa",
+		"steps": [
+			"runipy analyze_isa_with_dispatcher_control.ipynb"
+		 ]
+	}
+     ]
+ }
+}
diff --git a/Libraries/oneDNN/tutorials/simple_model.ipynb b/Libraries/oneDNN/tutorials/simple_model.ipynb
new file mode 100644
index 0000000000..7a9e4333dc
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/simple_model.ipynb
@@ -0,0 +1,527 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1.1 - Port an Intel® oneAPI Deep Neural Network Library (oneDNN) sample from CPU to GPU - oneDNN CNN FP32 Inference\n",
+    "\n",
+    "## Learning Objectives\n",
+    "In this module the developer will:\n",
+    "* Learn how to port a oneDNN sample from a CPU-only version to a CPU&GPU version by using DPC++\n",
+    "* Learn how to program a simple convolutional neural network by using oneDNN\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Exercise : Porting oneDNN application from CPU to GPU \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1 : introduce oneDNN configurations inside Intel® oneAPI toolkits\n",
+    "oneDNN has four different configurations inside the Intel oneAPI toolkits. Each configuration is in a different folder under the oneDNN installation path, and each configuration supports different compilers or threading libraries."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the installation path of your Intel oneAPI toolkit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ONEAPI_INSTALL=/opt/intel/oneapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!printf '%s\\n'    $ONEAPI_INSTALL/oneDNN/latest/cpu_*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see, there are 4 different folders under the oneDNN installation path, and each of those configurations supports different features. This tutorial will make use of two configurations."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First of all, create a lab folder for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir lab;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Step 2 : scanning the cnn_inference_f32.cpp code which only supports CPU\n",
+    "\n",
+    "This C++ API example demonstrates how to build an AlexNet neural network topology for forward-pass inference, and it can run only on CPU.\n",
+    "You can find a detailed code explanation at this [link](https://oneapi-src.github.io/oneDNN/cnn_inference_f32_cpp.html)\n",
+    "\n",
+    "There is a cnn_inference_f32.cpp, which has a CPU-only implementation.\n",
+    "Let us copy it into the lab folder and use it as the base of the lab.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp codes_for_ipynb/cnn_inference_f32.cpp lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The user could check the source file using the following command, but we recommend using the detailed code explanation at this [link](https://oneapi-src.github.io/oneDNN/cnn_inference_f32_cpp.html) instead."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cat lab/cnn_inference_f32.cpp "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, copy the required CMake file into the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp $ONEAPI_INSTALL/oneDNN/latest/cpu_gomp/examples/CMakeLists.txt lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step3:   Build and Execution\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Build and Run with GNU Compiler and OpenMP \n",
+    "For this CPU-only AlexNet neural network topology for forward-pass inference sample, the GNU compiler is used.\n",
+    "The following section guides you how to build with G++ and run on CPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler  command and flags that will generate the executable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp  --force > /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir cpu_gomp\n",
+    "cd cpu_gomp\n",
+    "cmake .. -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DDNNL_CPU_RUNTIME=OMP -DDNNL_GPU_RUNTIME=NONE\n",
+    "make cnn-inference-f32-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the Intel DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "The user must switch to the G++ oneDNN configuration by inputting a custom configuration \"--dnnl-configuration=cpu_gomp\" when running \"source setvars.sh\".\n",
+    "\n",
+    "By default, oneDNN Verbose log is disabled.\n",
+    "You can uncomment the line #export DNNL_VERBOSE=1 to enable oneDNN verbose log."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --dnnl-configuration=cpu_gomp  --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# uncomment the line below to enable oneDNN verbose log\n",
+    "#export DNNL_VERBOSE=1\n",
+    "./cpu_gomp/out/cnn-inference-f32-cpp\n",
+    "echo \"########## Done with the run\"\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "!rm -rf cpu_gomp; chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Enable oneDNN Verbose log and check the engine kind for each operation\n",
+    "cpu should be the engine kind for most of the operations, and you should be able to check the engine kind after \"dnnl_verbose,exec,\" for each operation.\n",
+    "Check this [link](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html) for a detailed explanation of oneDNN verbose log.\n",
+    "\n",
+    "Below is an example for oneDNN verbose log for convolution on CPU:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "dnnl_verbose,exec,cpu,convolution,jit:avx2,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb8a:f0 bia_f32::blocked:a:f0 dst_f32::blocked:aBcd8b:f0,,alg:convolution_direct,mb1_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,0.458008"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Step 4 : Modifying the cnn_inference_f32.cpp code to support both CPU and GPU\n",
+    "\n",
+    "In this session, we will convert the above cnn_inference_f32.cpp to support both CPU and GPU and compile the sample with DPC++ instead of G++.\n",
+    "\n",
+    "There are three steps to do the code conversion from CPU to GPU for this sample.\n",
+    "\n",
+    "* Step 1 : change engine::kind from CPU to GPU\n",
+    "* Step 2 : implement a function to access GPU memory via SYCL buffer and its accessor\n",
+    "* Step 3 : write user's data into GPU memory via the implemented function from Step 2\n",
+    "\n",
+    "There is a cnn_inference_f32.patch file inside the codes_for_ipynb folder. It contains all the changes for porting CPU to GPU against the CPU-only version of cnn_inference_f32.cpp.\n",
+    "First we must patch the cnn_inference_f32.cpp under the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd lab;patch < ../codes_for_ipynb/cnn_inference_f32.patch;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Users can check the source file using the following command."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cat lab/cnn_inference_f32.cpp "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can find related modification in below cnn_inference_f32.cpp, and the modifications for each step are wrapped up with \">>>>>>\" and \"<<<<<<\".\n",
+    "\n",
+    "### step1 : change engine::kind from CPU to GPU\n",
+    "changing engine kind from cpu to gpu during engine instantiation.\n",
+    "* Before patching : engine eng(engine::kind::cpu, 0);\n",
+    "* After patching : engine eng(engine::kind::gpu, 0);\n",
+    "\n",
+    "### step 2 : implement a function to access GPU memory via SYCL buffer and its accessor\n",
+    "You can refer to the below function write_to_dnnl_memory for that.\n",
+    "overall, we use SYCL buffer and its accessor to access GPU memory.\n",
+    "auto buffer = mem.get_sycl_buffer<uint8_t>();\n",
+    "auto dst = buffer.get_access<cl::sycl::access::mode::write>();"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "+// ------ GPU code conversion --Step 2 >>>>>>\n",
+    "+// Read from handle, write to memory\n",
+    "+inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {\n",
+    "+\n",
+    "+    dnnl::engine eng = mem.get_engine();\n",
+    "+    size_t size = mem.get_desc().get_size();\n",
+    "+\n",
+    "+    bool is_cpu_sycl = (DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL\n",
+    "+            && eng.get_kind() == dnnl::engine::kind::cpu);\n",
+    "+    bool is_gpu_sycl = (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL\n",
+    "+            && eng.get_kind() == dnnl::engine::kind::gpu);\n",
+    "+    if (is_cpu_sycl || is_gpu_sycl) {\n",
+    "+\n",
+    "+        auto buffer = mem.get_sycl_buffer<uint8_t>();\n",
+    "+        auto dst = buffer.get_access<cl::sycl::access::mode::write>();\n",
+    "+        uint8_t *dst_ptr = dst.get_pointer();\n",
+    "+\n",
+    "+        if (!dst_ptr || !handle) {\n",
+    "+            std::cerr << \"memory is NULL\"\n",
+    "+                      << \"\\n\";\n",
+    "+            return;\n",
+    "+        }\n",
+    "+        for (size_t i = 0; i < size; ++i)\n",
+    "+            dst_ptr[i] = ((uint8_t *)handle)[i];\n",
+    "+        return;\n",
+    "+    }\n",
+    "+\n",
+    "+    if (eng.get_kind() == dnnl::engine::kind::cpu) {\n",
+    "+        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());\n",
+    "+        if (!dst || !handle) {\n",
+    "+            std::cerr << \"memory is NULL\"\n",
+    "+                      << \"\\n\";\n",
+    "+            return;\n",
+    "+        }\n",
+    "+        for (size_t i = 0; i < size; ++i)\n",
+    "+            dst[i] = ((uint8_t *)handle)[i];\n",
+    "+        return;\n",
+    "+    }\n",
+    "+\n",
+    "+    assert(!\"not expected\");\n",
+    "+}\n",
+    "+//<<<<<< ------ GPU code conversion --Step 2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    " #### Step 3 : write user's data into GPU memory via the implemented function from Step 2\n",
+    " To place user data in GPU memory, we cannot write through the host pointer directly; instead, we use the write_to_dnnl_memory function. Refer to the code snippet below."
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "     auto user_src_memory = memory(\n",
+    "-            { { conv1_src_tz }, dt::f32, tag::nchw }, eng, user_src.data());\n",
+    "+            { { conv1_src_tz }, dt::f32, tag::nchw }, eng);\n",
+    "+    write_to_dnnl_memory(user_src.data(), user_src_memory);\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Build and Run with oneAPI DPC++ Compiler \n",
+    "For this  AlexNet neural network topology for forward-pass inference sample on GPU, DPC++ is used as the compiler.\n",
+    "The following section guides you how to build with DPC++ and run on GPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler command and flags that will generate the executable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh  --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force > /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir dpcpp\n",
+    "cd dpcpp\n",
+    "cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DDNNL_CPU_RUNTIME=SYCL -DDNNL_GPU_RUNTIME=SYCL\n",
+    "make cnn-inference-f32-cpp\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or in local environments.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "\n",
+    "By default, oneDNN Verbose log is disabled.\n",
+    "You can uncomment the line #export DNNL_VERBOSE=1 to enable oneDNN verbose log."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh  --dnnl-configuration=cpu_dpcpp_gpu_dpcpp --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "#export DNNL_VERBOSE=1\n",
+    "./dpcpp/out/cnn-inference-f32-cpp gpu\n",
+    "echo \"########## Done with the run\"\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit the **build.sh** and **run.sh** to the job queue.\n",
+    "\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts both on the DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm -rf dpcpp; chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Enable oneDNN Verbose log and check the engine kind for each operation\n",
+    "gpu should be the engine kind for most of the operations, and you should be able to check the engine kind after \"dnnl_verbose,exec,\" for each operation.\n",
+    "Check this [link](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html) for a detailed explanation of oneDNN verbose log.\n",
+    "\n",
+    "Below is an example for oneDNN verbose log for convolution on GPU:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "dnnl_verbose,exec,gpu,convolution,ocl:gen9:blocked,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_f32::blocked:a:f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb1_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Summary\n",
+    "In this lab, the developer learned the following:\n",
+    "* How to port a oneDNN sample from CPU-only version to CPU&GPU version\n",
+    "* How to program a simple convolutional neural network by using oneDNN"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "525.6px",
+    "left": "28px",
+    "top": "137.8px",
+    "width": "301.109px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Libraries/oneDNN/tutorials/verbose_jitdump.ipynb b/Libraries/oneDNN/tutorials/verbose_jitdump.ipynb
new file mode 100644
index 0000000000..262f1d4d22
--- /dev/null
+++ b/Libraries/oneDNN/tutorials/verbose_jitdump.ipynb
@@ -0,0 +1,755 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Profile Intel® oneAPI Deep Neural Network Library (oneDNN) Samples by using Verbose Mode and JIT DUMP inspection\n",
+    "\n",
+    "## Learning Objectives\n",
+    "In this module the developer will:\n",
+    "* Learn how to use Verbose Mode to profile oneDNN samples on CPU & GPU\n",
+    "* Learn how to inspect JIT Dump to profile oneDNN samples on CPU"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This module shows the elapsed time percentage over different oneDNN primitives\n",
+    "<img src=\"images/cpu.JPG\" style=\"float:left\" width=600>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This module also shows the elapsed time percentage over different oneDNN JIT or GPU kernels\n",
+    "<img src=\"images/cpu_jit.JPG\" style=\"float:left\" width=400>\n",
+    "<img src=\"images/gpu_kernel.JPG\" style=\"float:right\" width=400>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Verbose Mode Exercise\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Prerequisites\n",
+    "***\n",
+    "### Step 1: Prepare the build/run environment\n",
+    "oneDNN has four different configurations inside the Intel oneAPI toolkits. Each configuration is in a different folder under the oneDNN installation path, and each configuration supports a different compiler or threading library  \n",
+    "\n",
+    "Set the installation path of your oneAPI toolkit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# default path: /opt/intel/oneapi\n",
+    "%env ONEAPI_INSTALL=/opt/intel/oneapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!printf '%s\\n'     $ONEAPI_INSTALL/dnnl/latest/cpu_*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see, there are four different folders under the oneDNN installation path, and each of those configurations supports different features. This tutorial will use the dpcpp configuration to showcase the verbose log for both CPU and GPU."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a lab folder for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p lab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install required python packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip3 install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get current platform information for this exercise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from profiling.profile_utils import PlatformUtils\n",
+    "plat_utils = PlatformUtils()\n",
+    "plat_utils.dump_platform_info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "###  Step 2: Preparing the samples code\n",
+    "\n",
+    "This exercise uses the cnn_inference_f32.cpp example from the oneDNN installation path.\n",
+    "\n",
+    "The section below will copy the cnn_inference_f32.cpp file into the lab folder.  \n",
+    "This section also copies the required header files and CMake file into the lab folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_dpcpp_gpu_dpcpp/examples/cnn_inference_f32.cpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_dpcpp_gpu_dpcpp/examples/example_utils.hpp lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_dpcpp_gpu_dpcpp/examples/example_utils.h lab/\n",
+    "!cp $ONEAPI_INSTALL/dnnl/latest/cpu_dpcpp_gpu_dpcpp/examples/CMakeLists.txt lab/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Build and Run with the oneAPI DPC++ Compiler \n",
+    "One of the oneDNN configurations supports the oneAPI DPC++ compiler, and it can run on different architectures by using DPC++.\n",
+    "The following section shows you how to build with DPC++ and run on different architectures."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Script - build.sh\n",
+    "The script **build.sh** encapsulates the compiler **dpcpp** command and flags that will generate the executable.\n",
+    "To enable use of the DPC++ compiler and the related SYCL runtime, some definitions must be passed as cmake arguments.\n",
+    "Here are the related cmake arguments for the DPC++ configuration: \n",
+    "\n",
+    "   -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DDNNL_CPU_RUNTIME=SYCL -DDNNL_GPU_RUNTIME=SYCL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile build.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force> /dev/null 2>&1\n",
+    "export EXAMPLE_ROOT=./lab/\n",
+    "mkdir dpcpp\n",
+    "cd dpcpp\n",
+    "cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DDNNL_CPU_RUNTIME=SYCL -DDNNL_GPU_RUNTIME=SYCL\n",
+    "make cnn-inference-f32-cpp \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you achieve an all-clear from your compilation, you execute your program on the DevCloud or a local machine.\n",
+    "\n",
+    "#### Script - run.sh\n",
+    "The script **run.sh** encapsulates the program for submission to the job queue for execution.\n",
+    "By default, the built program uses CPU as the execution engine, but the user can switch to GPU by specifying the input argument \"gpu\".\n",
+    "The user can refer to run.sh below to run cnn-inference-f32-cpp on both CPU and GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# verbose log is disabled for this first run (DNNL_VERBOSE=0)\n",
+    "export DNNL_VERBOSE=0\n",
+    "./dpcpp/out/cnn-inference-f32-cpp cpu\n",
+    "./dpcpp/out/cnn-inference-f32-cpp gpu\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit **build.sh** and **run.sh** to the job queue.\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! rm -rf dpcpp;chmod 755 q; chmod 755 build.sh; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q build.sh; ./q run.sh; else ./build.sh; ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "  \n",
+    "## Enable Verbose Mode\n",
+    "***\n",
+    "In this section, we enable verbose mode on the built sample from the previous section, and users can see different results from CPU and GPU.  \n",
+    "Refer to the [link](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html) for detailed verbose mode information"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When the feature is enabled at build-time, you can use the DNNL_VERBOSE environment variable to turn verbose mode on and control the level of verbosity.\n",
+    "\n",
+    "|Environment variable|Value|Description|\n",
+    "|:-----|:----|:-----|\n",
+    "|DNNL_VERBOSE| 0 |no verbose output (default)|\n",
+    "||1|primitive information at execution|\n",
+    "||2|primitive information at creation and execution|\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Prepare run.sh and set DNNL_VERBOSE to 2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# enable verbose log\n",
+    "export DNNL_VERBOSE=2 \n",
+    "./dpcpp/out/cnn-inference-f32-cpp cpu >>log_cpu_f32_vb2.csv 2>&1\n",
+    "./dpcpp/out/cnn-inference-f32-cpp gpu >>log_gpu_f32_vb2.csv 2>&1\n",
+    "\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Submitting **build.sh** and **run.sh** to the job queue\n",
+    "Now we can submit **build.sh** and **run.sh** to the job queue.\n",
+    "##### NOTE - it is possible to execute any of the build and run commands in local environments.\n",
+    "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command **qsub**.  If the check fails, it is assumed that build/run will be local."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run.sh; else ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analyze Verbose Logs\n",
+    "***\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 1: List out all oneDNN verbose logs\n",
+    "Users should see two verbose logs listed in the table below.\n",
+    "\n",
+    "|Log File Name | Description |\n",
+    "|:-----|:----|\n",
+    "|log_cpu_f32_vb2.csv| log for cpu run |\n",
+    "|log_gpu_f32_vb2.csv| log for gpu run|"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "filenames= os.listdir (\".\") \n",
+    "result = []\n",
+    "keyword = \".csv\"\n",
+    "for filename in filenames: \n",
+    "    #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n",
+    "    if filename.find(keyword) != -1:\n",
+    "        result.append(filename)\n",
+    "result.sort()\n",
+    "\n",
+    "index =0 \n",
+    "for folder in result:\n",
+    "    print(\" %d : %s \" %(index, folder))\n",
+    "    index+=1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 2:  Pick a verbose log by putting its index value below\n",
+    "Users can pick either cpu or gpu log for analysis.   \n",
+    "Once users finish Step 2 to Step 8 for one log file, they can go back to step 2 and select another log file for analysis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FdIndex=0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### OPTIONAL: browse the content of selected verbose log."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logfile = result[FdIndex]\n",
+    "with open(logfile) as f:\n",
+    "    log = f.read()\n",
+    "print(log)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Parse verbose log and get the data back"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logfile = result[FdIndex]\n",
+    "print(logfile)\n",
+    "from profiling.profile_utils import oneDNNUtils, oneDNNLog\n",
+    "onednn = oneDNNUtils()\n",
+    "log1 = oneDNNLog()\n",
+    "log1.load_log(logfile)\n",
+    "data = log1.data\n",
+    "exec_data = log1.exec_data\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 4: Time breakdown for exec type\n",
+    "The exec type includes exec and create. \n",
+    "\n",
+    "|exec type | Description |  \n",
+    "|:-----|:----|  \n",
+    "|exec | Time for primitives execution. Better to spend most of the time on primitives execution. |  \n",
+    "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. |  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(data,\"exec\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 5: Time breakdown for primitives type\n",
+    "The primitives type includes convolution, reorder, sum, etc.  \n",
+    "For this simple convolution net example, convolution and inner product primitives are expected to spend most of time.  \n",
+    "However, the exact time percentage of different primitives may vary among different architectures.    \n",
+    "Users can easily identify top hotspots of primitives executions with this time breakdown.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(exec_data,\"type\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 6:  Time breakdown for JIT kernel type"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system.   \n",
+    "Therefore, users can see different JIT kernel type among different CPU and GPU architectures.  \n",
+    "For example, users can see avx_core_vnni JIT kernel if the workload uses VNNI instruction on Cascade Lake platform.  \n",
+    "Users can also see different OCL kernels among different Intel GPU generations.  \n",
+    "Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown.  \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(exec_data,\"jit\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 7:  Time breakdown for algorithm type\n",
+    "oneDNN also supports different algorithms.  \n",
+    "Users can identify the top hotspots of algorithm executions with this time breakdown.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(exec_data,\"alg\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 8: Time breakdown for architecture type\n",
+    "The supported architectures include CPU and GPU.  \n",
+    "For this simple net sample, we don't split computation among CPU and GPU,    \n",
+    "so users should see either 100% CPU time or 100% GPU time. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "onednn.breakdown(data,\"arch\",\"time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "## Inspecting JIT Code\n",
+    "\n",
+    "In this section, we dump the JIT code from the sample built in the previous section, and users can see different results from CPU.    \n",
+    "Refer to the [link](https://oneapi-src.github.io/oneDNN/dev_guide_inspecting_jit.html) for detailed JIT Dump information"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When the feature is enabled at build-time, you can use the DNNL_JIT_DUMP environment variable to inspect JIT code.\n",
+    "\n",
+    "|Environment variable|Value|Description|\n",
+    "|:-----|:----|:-----|\n",
+    "|DNNL_JIT_DUMP | 0 |JIT dump is disabled (default)|\n",
+    "||any other value|JIT dump is enabled|\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 1: Prepare run.sh and enable DNNL_JIT_DUMP as 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile run.sh\n",
+    "#!/bin/bash\n",
+    "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n",
+    "echo \"########## Executing the run\"\n",
+    "# disable verbose log\n",
+    "export DNNL_VERBOSE=0\n",
+    "# enable JIT Dump\n",
+    "export DNNL_JIT_DUMP=1 \n",
+    "./dpcpp/out/cnn-inference-f32-cpp cpu\n",
+    "echo \"########## Done with the run\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### Step 2: Submitting **run.sh** to the job queue\n",
+    "Now we can submit **run.sh** to the job queue."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "! chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run.sh; else ./run.sh; fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 3: Move all JIT Dump files into the jitdump folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir jitdump;mv *.bin jitdump"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 4: List out all oneDNN JIT Dump files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "filenames= os.listdir (\"jitdump\") \n",
+    "result = []\n",
+    "keyword = \".bin\"\n",
+    "for filename in filenames: \n",
+    "    #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n",
+    "    if filename.find(keyword) != -1:\n",
+    "        result.append(filename)\n",
+    "result.sort()\n",
+    "\n",
+    "index =0 \n",
+    "for folder in result:\n",
+    "    print(\" %d : %s \" %(index, folder))\n",
+    "    index+=1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 5: Pick a JIT Dump file by putting its index value below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FdIndex=2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 6: Export JIT Dump file to environment variable JITFILE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logfile = result[FdIndex]\n",
+    "os.environ[\"JITFILE\"] = logfile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Step 7: Disassemble the JIT Dump file to view the code\n",
+    "\n",
+    "> NOTE: If the oneDNN sample uses VNNI instruction, users should be able to see \"vpdpbusd\" instruction from the JIT Dump file  \n",
+    "\n",
+    "> NOTE: If the oneDNN sample uses BF16 instruction, users should see usage of vdpbf16ps or vcvtne2ps2bf16 in the JIT dump file.  \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> NOTE: To disassemble vdpbf16ps and vcvtne2ps2bf16 instructions, users must use objdump v2.34 or above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!objdump -D -b binary -mi386:x86-64 jitdump/$JITFILE"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***\n",
+    "# Summary\n",
+    "In this lab the developer learned the following:\n",
+    "* how to use Verbose Mode to profile different oneDNN samples on CPU and GPU\n",
+    "* how to inspect JIT Dump to profile oneDNN samples on CPU\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "525.6px",
+    "left": "28px",
+    "top": "137.8px",
+    "width": "301.109px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}