diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/CMakeLists.txt
new file mode 100755
index 0000000000..1066f774ea
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/CMakeLists.txt
@@ -0,0 +1,20 @@
+if(UNIX)
+ # Direct CMake to use dpcpp rather than the default C++ compiler/linker
+ set(CMAKE_CXX_COMPILER dpcpp)
+else() # Windows
+ # Force CMake to use dpcpp rather than the default C++ compiler/linker
+ # (needed on Windows only)
+ include (CMakeForceCompiler)
+ CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP)
+ include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.4)
+
+project(ANR CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/License.txt
new file mode 100755
index 0000000000..7c8b8a36c6
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/License.txt
@@ -0,0 +1,23 @@
+Copyright Intel Corporation
+
+SPDX-License-Identifier: MIT
+https://opensource.org/licenses/MIT
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/README.md
new file mode 100755
index 0000000000..f3be849cea
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/README.md
@@ -0,0 +1,208 @@
+# Adaptive Noise Reduction (ANR)
+This DPC++ reference design demonstrates a highly optimized image sensor adaptive noise reduction (ANR) algorithm on an FPGA.
+
+***Documentation***:
+* [DPC++ FPGA Code Samples Guide](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of DPC++ for FPGA.
+* [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) is the reference manual for targeting FPGAs through DPC++.
+* [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX) Intel Xeon® CPU E5-1650 v2 @ 3.50GHz (host machine)
+| Software | Intel® oneAPI DPC++ Compiler Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How to create a parameterizable image processing pipeline to implement an Adaptive Noise Reduction (ANR) algorithm on an FPGA.
+| Time to complete | 1 hour
+
+## Purpose
+This FPGA reference design demonstrates a parameterizable image processing pipeline that implements an Adaptive Noise Reduction (ANR) algorithm using a bilateral filter. See the [Additional Design Information Section](#additional-design-information) for more information on the ANR algorithm itself and how it was implemented for the FPGA.
+
+## License
+Code samples are licensed under the MIT license. See
+[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+## Building the Reference Design
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Code Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+1. Install the design into a directory `build` from the design directory by running `cmake`:
+
+ ```
+ mkdir build
+ cd build
+ ```
+
+ If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+
+ ```
+ cmake ..
+ ```
+
+ If instead you are compiling for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following targets are provided, and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ make fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report in `anr_report.prj/reports/report.html`directory.
+
+ ```
+ make report
+ ```
+
+ * Compile for FPGA hardware (longer compile time, targets FPGA device).
+
+ ```
+ make fpga
+ ```
+
+3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded here.
+
+### On a Windows* System
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" ..
+ ```
+ Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ nmake fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ nmake report
+ ```
+ * An FPGA hardware target is not provided on Windows*.
+
+*Note:* The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not yet support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU).
+ ```
+ ./anr.fpga_emu (Linux)
+ anr.fpga_emu.exe (Windows)
+ ```
+
+2. Run the sample on the FPGA device.
+ ```
+ ./anr.fpga (Linux)
+ ```
+
+### Example of Output
+You should see output similar to the following in the console:
+```
+Runs: 2
+Columns: 1920
+Rows: 1436
+Frames: 8
+Filter Size: 9
+Pixels Per Cycle: 2
+Maximum Columns: 2048
+
+Execution time: 45.0012 ms
+Throughput: 488.876 MB/s
+PASSED
+```
+NOTE: When running on the FPGA emulator, the *Execution time* and *Throughput* do not reflect the design's actual hardware performance.
+
+
+## Additional Design Information
+### Source Code Breakdown
+The following source files can be found in the `src/` sub-directory.
+
+| File | Description
+|:--- |:---
+|`main.cpp` | Contains the `main()` function and the top-level launching, validation, and performance measurements.
+|`anr_params.hpp` | A class for parsing and holding the ANR specific parameters, such as the sigma coefficients, the filter size, and the alpha value.
+|`anr.hpp` | Contains the logic for submitting all of the ANR kernels to the queue and the top-level description of the bilateral filter (both horizontal and vertical).
+|`column_stencil.hpp` | A generic library for computing a column stencil (a 1D vertical convolution).
+|`constants.hpp` | Contains the constants and datatypes for the ANR algorithm.
+|`data_bundle.hpp` | A generic library for bundling data to move between kernels; essentially an array.
+|`dma_kernels.hpp` | Contains kernels that move data between the host and device, as well as reading/writing data between the FPGA and device memory.
+|`intensity_sigma_lut.hpp` | A RAM LUT for the intensity sigma values.
+|`mp_math.hpp` | Metaprogramming math helper functions.
+|`qfp_exp_lut.hpp` | A ROM LUT for computing exp(-x) on a 32-bit floating-point value (using a QFP).
+|`qfp_inv_lut.hpp` | A ROM LUT for computing 1/x on a 32-bit floating-point value (using a QFP).
+|`qfp.hpp` | Contains a class with generic static methods for converting between 32-bit floating-point and quantized floating-point (QFP).
+|`rom_base.hpp` | A generic library for creating a ROM from a `constexpr` class.
+|`row_stencil.hpp` | A generic library for computing a row stencil (a 1D horizontal convolution).
+|`shift_reg.hpp` | A generic library for a shift register.
+|`unrolled_loop.hpp` | A templated-based loop unroller that unrolls loops in the compiler front end.
+
+### ANR Algorithm
+The ANR algorithm works on an input image that is in [Bayer format](https://en.wikipedia.org/wiki/Bayer_filter). Unlike image formats you may be used to (e.g., PNG or JPG), where each pixel has a red, green, **and** blue value (RGB), each pixel in a Bayer format image is either red, green, **or** blue, as shown in the image below. To convert to an RGB image, you take a 4x4 square and generate the RGB pixel by averaging the two green pixels. One purpose of this format is to dedicate more pixels to green, since the human eye is more sensitive to green.
+
+
+
+The ANR algorithm uses a [bilateral filter](https://en.wikipedia.org/wiki/Bilateral_filter). Unilateral filters (e.g., a [Box blur](https://en.wikipedia.org/wiki/Box_blur) or [Gaussian blur](https://en.wikipedia.org/wiki/Gaussian_blur)) replace the intensity of a given pixel with a weighted average of the neighbouring pixels, where the weight of each neighouring pixel depends on the spatial distance from the pixel being computed. With bilateral filters, like the one used in this design, the weight of each neighbouring pixel depends on both the spatial distance, and the difference in pixel intensity. This makes bilateral filters much better at preserving sharp edges.
+
+Bilateral filters are non-linear and therefore non-separable. Note that in the case of a 5x5 window (shown below), only 9 pixels (not 25) are used in the computation; this is an artifact of the Bayer image format. The most accurate approach would produce the bilateral filter window and convolve the entire window of the given pixel colour (for the image below, red) at once to generate the output pixel (the middle pixel). For the 5x5 case, this would result in 9 multiplications that need to be summed.
+
+
+
+This produces a long chain of adders to sum the results of the multiplications. In this design, we approximate the bilateral filter by making it separable. We first apply a 1D vertical filter to the middle pixel, and then a 1D horizontal filter, as shown in the image below. This reduces the number of multiplications that need to be summed together to 3. In our design, we apply the vertical filter to *all* pixels first, and then apply the horizontal filter, which results in the *corner* pixels indirectly applying some weight to the middle pixel.
+
+
+
+### ANR FPGA Design
+The ANR algorithm is designed as a streaming kernel system with input pixels streaming through the input pipe, and the denoised output pixels streaming out the output pipe, as shown in the figure below. The design consists of two kernels, `Vertical Kernel` and `Horizontal Kernel`, that are connected by an internal SYCL pipe, as shown in the figure below. The `Vertical Kernel` computes an intensity sigma value based on the current pixel, computes the bilateral filter, and applies it to the current window to produce an intermediate pixel value. The `Vertical Kernel` kernel sends three values through the internal pipe: the original pixel value, the current pixel value (i.e., intermediate pixel that was just computed), and the intensity sigma value. The `Horizontal Kernel` streams in these tuples and performs a similar computation but on a horizontal window. It uses the forwarded intensity sigma value to compute the bilateral filer, the new pixel values to perform the bilateral filter computation, and the original pixel to perform *alpha blending*, where the output pixel is a weighted percentage of the original pixel value and the denoised pixel value.
+
+
+
+To compute a given pixel, the `Vertical Kernel` must store previous rows (i.e., lines) of the input image. The technique to do so is shown in the image below. The pixels are streamed in from the pipe and used with pixels from previous rows to perform the 1D vertical window operation.
+
+
+
+The logic for the `Horizontal Kernel`, shown below, is much simpler since it operates on a single row at a time.
+
+
+
+To produce the input data and consume the output, we setup a full system as shown in the figure below. The `Input Kernel` reads input data from device memory and provides it to the ANR design via the input pipe. The `Output Kernel` reads the ANR design's output from the output pipe and writes it to device memory. The oneAPI host code then uses the output data to validate the accuracy of the ANR algorithm against a golden result using the [Peak signal-to-noise ratio (PSNR)](https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio).
+
+
+
+### Quantized Floating-Point (QFP)
+Floating-point values consist of a sign bit, an exponent, and a mantissa. In this design, we take [32-bit single-precision](https://en.wikipedia.org/wiki/Single-precision_floating-point_format) floating values and convert them to quantized floating-point (QFP) values which use less bits. All of the QFPs in this design have 10 bits total, but use a different number for the exponent and mantissa. The purpose of this conversion is to be able to create lookup-table (LUT) read-only memories (ROMs) to approximate expensive 32-bit floating-point operations like an exponential (`exp(x)`) and inversion (`1/x`). Creating LUT ROMs for 32-bit floats would require `2^32*4 = 17GB` bytes of on-chip memory. However, if the float can be *quantized* to 10 bits, it requires only `2^10*4 = 4KB` of on-chip memory, at the expense of reduced precision.
+
+### Reusable Header Files
+In this design, we use the following generic header files:
+ - `ColumnStencil` (*column_stencil.hpp*): A library for generalizing a column stencil (i.e., the vertical filter) using C++ functors for callbacks to perform the filter. This library hides the details of the FIFO line stores and padding logic and allows the user to simply worry about the filter convolution.
+ - `DataBundle` (*data_bundle.hpp*): A library for holding multiple pieces of the same data. This class is similar to a C++ `std::array`, but ensures that the constructors and `operator=` are overriden properly to avoid expensive loops.
+ - `ROMBase` (*rom_base.hpp*): This library provides a base class for creating a `constexpr` class that results in a ROM in the FPGA.
+ - `RowStencil` (*row_stencil.hpp*): A library for generalizing a row stencil (i.e., the horizontal filter) using C++ functors for callbacks to perform the filter. This library hides the details of the shift register and padding logic and allows the user to simply worry about the filter convolution.
+ - `ShiftReg` (*shift_reg.hpp*): A library to implement a shift register. This hides the logic necessary to ensure the compiler infers an efficient shift register behind easy-to-use API calls.
+ - `UnrolledLoop` (*unrolled_loop.hpp*): A library that implements a front-end unrolled loop using C++ metaprogramming.
+ - *mp_math.hpp*: A set of various `constexpr` math functions that are implemented using C++ metaprogramming.
+
+ For more information on the usage and implementation of these header libraries, view the source code (the `.hpp` files), which are well commented for documentation.
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.sln
new file mode 100755
index 0000000000..9f51a9168d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "anr", "anr.vcxproj", "{ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.ActiveCfg = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.Build.0 = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.ActiveCfg = Release|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {97D1BD74-AAAB-4835-8F00-37A58B70871A}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.vcxproj
new file mode 100755
index 0000000000..aa084c4102
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr.vcxproj
@@ -0,0 +1,180 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {acde6b7a-6f9a-428e-b040-cedc5b1e2c79}
+ Win32Proj
+ qrd
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -fconstexpr-steps=5084968 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -fconstexpr-steps=5084968 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_ip.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_ip.png
new file mode 100755
index 0000000000..ef0b161642
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_ip.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_system.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_system.png
new file mode 100755
index 0000000000..973993db11
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/anr_system.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bayer.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bayer.png
new file mode 100755
index 0000000000..af7ecabc2e
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bayer.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bilateral_estimate.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bilateral_estimate.png
new file mode 100755
index 0000000000..fac14a0730
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/bilateral_estimate.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/conv.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/conv.png
new file mode 100755
index 0000000000..cb2957f202
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/conv.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/horizontal_kernel.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/horizontal_kernel.png
new file mode 100755
index 0000000000..87eb68c3e8
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/horizontal_kernel.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/sample.json
new file mode 100755
index 0000000000..b16a34bc62
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/sample.json
@@ -0,0 +1,55 @@
+{
+ "guid": "B2974A99-4BC6-4F3E-AD3E-C17FE86AC2A4",
+ "name": "Adaptive Noise Reduction",
+ "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Reference Designs"],
+ "description": "A highly optimized adaptive noise reduction (ANR) algorithm on an FPGA.",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./anr.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake -G \"NMake Makefiles\" ..",
+ "nmake fpga_emu",
+ "anr.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake -G \"NMake Makefiles\" ..",
+ "nmake report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/CMakeLists.txt
new file mode 100644
index 0000000000..46852583f2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/CMakeLists.txt
@@ -0,0 +1,144 @@
+set(TARGET_NAME anr)
+set(SOURCE_FILE main.cpp)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# FPGA board selection
+if(NOT DEFINED FPGA_BOARD)
+ set(FPGA_BOARD "intel_a10gx_pac:pac_a10")
+ message(STATUS "FPGA_BOARD was not specified.\
+ \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). \
+ \nPlease refer to the README for information on board selection.")
+else()
+ message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}")
+endif()
+
+# This is a Windows-specific flag that enables error handling in host code
+if(WIN32)
+ set(WIN_FLAG "/EHsc")
+endif()
+
+# Allow the user to enable hardware profiling
+# Profiling can be enabled when running cmake by adding the flag -DPROFILE_HW=1
+# e.g. cmake .. -DPROFILE_HW=1
+if(PROFILE_HW)
+ set(PROFILE_FLAG "-Xsprofile")
+endif()
+
+# Allow the user to do a flat compile
+# Profiling can be enabled when running cmake by adding the flag -DFLAT_COMPILE=1
+# e.g. cmake .. -DFLAT_COMPILE=1
+if(FLAT_COMPILE)
+ set(FLAT_COMPILE_FLAG "-Xsbsp-flow=flat")
+endif()
+
+# Choose the random seed for the hardware compile
+# e.g. cmake .. -DSEED=7
+if(NOT DEFINED SEED)
+ # the default seed
+ if(FPGA_BOARD MATCHES ".*a10.*")
+ set(SEED 1)
+ elseif(FPGA_BOARD MATCHES ".*s10.*")
+ set(SEED 2)
+ elseif(FPGA_BOARD MATCHES ".*agilex.*")
+ set(SEED 3)
+ else()
+ set(SEED 4)
+ endif()
+else()
+ message(STATUS "Seed explicitly set to ${SEED}")
+endif()
+
+# Allow the user to change the filter size
+# e.g. cmake .. -DFILTER_SIZE=9
+if(FILTER_SIZE)
+ set(FILTER_SIZE_FLAG "-DFILTER_SIZE=${FILTER_SIZE}")
+ message(STATUS "FILTER_SIZE explicitly set to ${FILTER_SIZE}")
+endif()
+
+# Allow the user to change the pixels per cycle
+# e.g. cmake .. -DPIXELS_PER_CYCLE=9
+if(PIXELS_PER_CYCLE)
+ set(PIXELS_PER_CYCLE_FLAG "-DPIXELS_PER_CYCLE=${PIXELS_PER_CYCLE}")
+ message(STATUS "PIXELS_PER_CYCLE explicitly set to ${PIXELS_PER_CYCLE}")
+else()
+ # Default PIXELS_PER_CYCLE based on the board being used
+ if(FPGA_BOARD MATCHES ".*a10.*")
+ set(PIXELS_PER_CYCLE 2)
+ elseif(FPGA_BOARD MATCHES ".*s10.*")
+ set(PIXELS_PER_CYCLE 2)
+ elseif(FPGA_BOARD MATCHES ".*agilex.*")
+ set(PIXELS_PER_CYCLE 1)
+ else()
+ message(WARNING "Unknown board: setting PIXELS_PER_CYCLE to 1")
+ set(PIXELS_PER_CYCLE 1)
+ endif()
+endif()
+set(PIXELS_PER_CYCLE_FLAG "-DPIXELS_PER_CYCLE=${PIXELS_PER_CYCLE}")
+
+# Allow the user to change the maximum number of pixels per column
+# e.g. cmake .. -DMAX_COLS=3840
+if(MAX_COLS)
+ set(MAX_COLS_FLAG "-DMAX_COLS=${MAX_COLS}")
+ message(STATUS "MAX_COLS explicitly set to ${MAX_COLS}")
+endif()
+
+# Allow the user to change the bitwidth of the pixels
+# e.g. cmake .. -DPIXEL_BITS=8
+if(PIXEL_BITS)
+ set(PIXEL_BITS_FLAG "-DPIXEL_BITS=${PIXEL_BITS}")
+ message(STATUS "PIXEL_BITS explicitly set to ${PIXEL_BITS}")
+endif()
+
+# Increase the allowable constexpr steps for the front end. This allows the
+# front-end compiler to do more compile-time computation.
+set(CONSTEXPR_STEPS "-fconstexpr-steps=5084968")
+
+# Print out configured variables
+message(STATUS " SEED=${SEED}")
+message(STATUS " PIXELS_PER_CYCLE=${PIXELS_PER_CYCLE}")
+if(FILTER_SIZE)
+ message(STATUS " FILTER_SIZE=${FILTER_SIZE}")
+endif()
+if(PIXEL_BITS)
+ message(STATUS " PIXEL_BITS=${PIXEL_BITS}")
+endif()
+
+# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages.
+# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
+# 2. The "link" stage invokes the compiler's FPGA backend before linking.
+# For this reason, FPGA backend flags must be passed as link flags in CMake.
+set(EMULATOR_COMPILE_FLAGS "-Wall ${CONSTEXPR_STEPS} ${WIN_FLAG} -fintelfpga ${FILTER_SIZE_FLAG} ${PIXELS_PER_CYCLE_FLAG} ${MAX_COLS_FLAG} ${PIXEL_BITS_FLAG} -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga ${FILTER_SIZE_FLAG} ${PIXELS_PER_CYCLE_FLAG} ${MAX_COLS_FLAG} ${PIXEL_BITS_FLAG}")
+set(HARDWARE_COMPILE_FLAGS "-Wall ${CONSTEXPR_STEPS} ${WIN_FLAG} -fintelfpga ${FILTER_SIZE_FLAG} ${PIXELS_PER_CYCLE_FLAG} ${MAX_COLS_FLAG} ${PIXEL_BITS_FLAG}")
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware ${PROFILE_FLAG} ${FLAT_COMPILE_FLAG} -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${FPGA_BOARD} ${FILTER_SIZE_FLAG} ${PIXELS_PER_CYCLE_FLAG} ${MAX_COLS_FLAG} ${PIXEL_BITS_FLAG} ${IP_MODE_FLAG} ${USER_HARDWARE_FLAGS}")
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+
+###############################################################################
+### FPGA Emulator
+###############################################################################
+add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}")
+set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}")
+add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+
+###############################################################################
+### Generate Report
+###############################################################################
+set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a)
+# The compile output is not an executable, but an intermediate compilation result unique to DPC++.
+add_executable(${FPGA_EARLY_IMAGE} ${SOURCE_FILE})
+add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE})
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}")
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -fsycl-link=early")
+# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus
+
+###############################################################################
+### FPGA Hardware
+###############################################################################
+add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}")
+set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_TARGET}")
+# The -reuse-exe flag enables rapid recompilation of host-only code changes.
+# See DPC++FPGA/GettingStarted/fast_recompile for details.
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr.hpp
new file mode 100644
index 0000000000..b05b670122
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr.hpp
@@ -0,0 +1,403 @@
+#ifndef __ANR_HPP__
+#define __ANR_HPP__
+
+//
+// This file contains a bulk of the functionality for the ANR design on the
+// on the device. It contains the logic to submit the various kernels for the
+// ANR pipeline.
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "anr_params.hpp"
+#include "column_stencil.hpp"
+#include "constants.hpp"
+#include "data_bundle.hpp"
+#include "intensity_sigma_lut.hpp"
+#include "mp_math.hpp"
+#include "qfp.hpp"
+#include "qfp_exp_lut.hpp"
+#include "qfp_inv_lut.hpp"
+#include "row_stencil.hpp"
+#include "shift_reg.hpp"
+#include "unrolled_loop.hpp"
+
+using namespace sycl;
+
+// declare the kernel and pipe names globally to reduce name mangling
+class IntraPipeID;
+class VerticalKernelID;
+class HorizontalKernelID;
+
+//
+// A struct to carry the new (i.e., current) pixel, the original pixel, and the
+// intensity sigma from the vertical to horizontal kernel
+//
+struct DataForwardStruct {
+ DataForwardStruct() {}
+ DataForwardStruct(PixelT pixel_n) : pixel_n(pixel_n) {}
+ DataForwardStruct(PixelT pixel_n, PixelT pixel_o, float sig_i)
+ : pixel_n(pixel_n), pixel_o(pixel_o), sig_i(sig_i) {}
+
+ PixelT pixel_n; // the new pixel
+ PixelT pixel_o; // the original pixel
+ float sig_i; // the intensity sigma value for the pixel
+};
+
+//
+// Build the power values for a 1D gaussian filter. The 'actual' gaussian values
+// are exp(-x) where 'x' are the 'powers' in the 'filter'.
+//
+template
+auto BuildGaussianPowers1D(float sigma) {
+ ShiftReg filter;
+ for (int x = -size / 2; x <= size / 2; x++) {
+ float x_over_sig = x / sigma;
+ filter[x + size / 2] = 0.5 * x_over_sig * x_over_sig; // 0.5*(x/sigma)^2
+ }
+ return filter;
+}
+
+//
+// Given a float value, convert it to an unsigned pixel value by saturating it
+// in the extremes (i.e., min/max).
+//
+PixelT Saturate(float pixel_float) {
+ constexpr unsigned kMaxPixelVal = std::numeric_limits::max();
+ constexpr unsigned kMinPixelVal = std::numeric_limits::min();
+ constexpr unsigned kFloatSignOffset = ((sizeof(float) * 8) - 1);
+
+ // get the bits of the float for the negative check
+ unsigned int pixel_float_bits = reinterpret_cast(pixel_float);
+
+ PixelT pixel;
+ if (pixel_float >= kMaxPixelVal) {
+ pixel = kMaxPixelVal;
+ } else if ((pixel_float_bits >> kFloatSignOffset) & 0x1) { // pixel_float < 0
+ pixel = kMinPixelVal;
+ } else {
+ pixel = pixel_float;
+ }
+ return pixel;
+}
+
+//
+// Computes the 1D bilateral filter. The spatial filter is passed as an
+// argument and the intensity filter is computed based on the pixel window
+// ('buffer') and the ANR parameters ('params'). Together, the spatial and
+// intensity filters create the bilateral filter.
+//
+template
+inline float BilateralFilter1D(ShiftReg& buffer,
+ ShiftReg& spatial_power,
+ ANRParams params, float sig_i_inv_squared_x_half,
+ const ExpLUT& exp_lut,
+ const InvLUT& inv_lut) {
+ // We need to hold all pixels bits, but need the type to be signed for the
+ // BilateralFilter1D since it subtracts the values. So add one bit to it and
+ // make it a signed ac_int.
+ using SignedPixelT = ac_int;
+
+ // the middle pixel index
+ constexpr int mid_idx = filter_size / 2;
+
+ // static asserts
+ static_assert(filter_size > 1);
+
+ // convert unsigned pixels to signed
+ ShiftReg buffer_signed;
+ UnrolledLoop([&](auto i) {
+ buffer_signed[i] = static_cast(buffer[i]);
+ });
+
+ // build the bilateral filter
+ ShiftReg bilateral_filter;
+ float filter_sum = 0.0;
+
+ UnrolledLoop([&](auto i) {
+ // get the absolute value of the pixel differences
+ float intensity_diff_squared;
+ if constexpr (mid_idx == (i * 2)) {
+ // special case for middle pixel, the absolute difference will be 0
+ // and therefore the absolute value difference squared will also be 0
+ intensity_diff_squared = 0.0f;
+ } else {
+ // compute differences squared
+ const SignedPixelT intensity_diff =
+ buffer_signed[mid_idx] - buffer_signed[i * 2];
+ intensity_diff_squared = intensity_diff * intensity_diff;
+ }
+
+ // compute the filter value as e^-(intensity_component + spatial_component)
+ // Use a LUT to compute the exp(-x) value
+ float filter_val;
+ if constexpr (mid_idx == (i * 2)) {
+ // (buffer[mid_idx] - buffer[i * 2]) = 0 in this case, so the intensity
+ // component is 0. For similar reasons, the spatial component is also 0
+ // and therefore filter_val is e^(-0) = 1.
+ filter_val = 1.0f;
+ } else {
+ // intensity_component = 1/2 * (intensity_diff/sig_i)^2
+ // = 1/2*(1/sig_i)^2 *(intensity_diff)^2
+ // We have precomputed 1/2*(1/sig_i)^2 in the host
+ const float intensity_component =
+ intensity_diff_squared * sig_i_inv_squared_x_half;
+
+ // spatial component is a regular Gaussian that was precomputed using
+ // the 'BuildGaussianPowers1D' function
+ const float spatial_component = spatial_power[i];
+
+ // the bilateral filter power value, where the actual bilateral filter
+ // value is e^-(exp_power)
+ const float exp_power = intensity_component + spatial_component;
+
+ // now that we have the exponential power value ('exp_power'), use the
+ // exponential LUT ('ExpLUT') to lookup the result of exp(-exp_power).
+ // NOTE: when creating the exponential LUT, we stored the values of
+ // exp(-x) = 1/exp(x). This avoids negating the value of 'exp_power'.
+ const auto exp_lut_idx = ExpLUT::QFP::FromFP32(exp_power);
+ filter_val = ExpLUT::QFP::ToFP32(exp_lut[exp_lut_idx]);
+ }
+
+ // compute the bilateral filter value
+ bilateral_filter[i] = filter_val;
+ filter_sum += filter_val;
+ });
+
+ // Convolve the 1D bilateral filter with the pixel window
+ float filtered_pixel = 0.0;
+ UnrolledLoop([&](auto i) {
+ filtered_pixel += (float(buffer[i * 2]) * bilateral_filter[i]);
+ });
+
+ // Normalize the pixel value by the bilateral filter sum. Use the inverse
+ // LUT to compute 1/filter_sum. This saves area by using a 32-bit
+ // floating-point multiplication, instead of division.
+ // Computes: filtered_pixel /= filter_sum
+ const auto inv_lut_idx = InvLUT::QFP::FromFP32(filter_sum);
+ filtered_pixel *= InvLUT::QFP::ToFP32(inv_lut[inv_lut_idx]);
+
+ return filtered_pixel;
+}
+
+//
+// Functor for the column stencil callback.
+// This performs the 1D vertical bilateral filter. It also computes the
+// intensity sigma value (sig_i) and bundles it with the original and partially
+// filtered pixel to be forwarded to the horizontal kernel.
+//
+template
+struct VerticalFunctor {
+ auto operator()(int row, int col, ShiftReg buffer,
+ ShiftReg spatial_power,
+ ANRParams params, const ExpLUT& exp_lut,
+ const InvLUT& inv_lut, IntensitySigmaLUT& sig_i_lut) const {
+ // static asserts to validate template arguments
+ static_assert(filter_size > 1);
+
+ // get the middle index and compute the intensity sigma from it
+ constexpr int mid_idx = filter_size / 2;
+ const PixelT middle_pixel = buffer[mid_idx];
+ const auto sig_i_inv_squared_x_half = sig_i_lut[middle_pixel];
+
+ // perform the vertical 1D bilateral filter
+ auto output_pixel_float = BilateralFilter1D(
+ buffer, spatial_power, params, sig_i_inv_squared_x_half,
+ exp_lut, inv_lut);
+
+ // saturate the output pixel
+ PixelT output_pixel = Saturate(output_pixel_float);
+
+ // return the result, which is the output pixel, as well as the intensity
+ // sigma value (sig_i) and the original pixel, which are forwarded to the
+ // horizontal kernel for the horizontal 1D bilateral calculation and alpha
+ // blending, respectively.
+ return DataForwardStruct(output_pixel, middle_pixel,
+ sig_i_inv_squared_x_half);
+ }
+};
+
+//
+// Functor for the row stencil callback.
+// This performs the 1D horizontal bilateral filter. It uses the intensity
+// sigma (sig_i) that was forwarded from the vertical kernel.
+//
+template
+struct HorizontalFunctor {
+ auto operator()(int row, int col,
+ ShiftReg buffer,
+ ShiftReg spatial_power,
+ ANRParams params, ANRParams::AlphaFixedT alpha_fixed,
+ ANRParams::AlphaFixedT one_minus_alpha_fixed,
+ const ExpLUT& exp_lut, const InvLUT& inv_lut) const {
+ // static asserts
+ static_assert(filter_size > 1);
+
+ // grab the intensity sigma for the middle pixel (forwarded from the
+ // vertical kernel)
+ constexpr int mid_idx = filter_size / 2;
+ const float sig_i_inv_squared_x_half = buffer[mid_idx].sig_i;
+
+ // grab just the pixel data, pixel_n is the 'new' pixel forwarded from the
+ // vertical kernel (i.e., the partially filtered one)
+ ShiftReg buffer_pixels;
+ UnrolledLoop([&](auto i) {
+ buffer_pixels[i] = buffer[i].pixel_n;
+ });
+
+ // perform the horizontal 1D bilateral filter
+ auto output_pixel_float = BilateralFilter1D(
+ buffer_pixels, spatial_power, params, sig_i_inv_squared_x_half,
+ exp_lut, inv_lut);
+
+ // saturate the output pixel
+ PixelT output_pixel = Saturate(output_pixel_float);
+
+ // fixed-point alpha blending with the original pixel
+ const PixelT original_pixel(buffer[mid_idx].pixel_o);
+ auto output_pixel_alpha =
+ (alpha_fixed * output_pixel) + (one_minus_alpha_fixed * original_pixel);
+ auto output_pixel_tmp = output_pixel_alpha.to_ac_int();
+
+ // return the result casted back to a pixel
+ return PixelT(output_pixel_tmp);
+ }
+};
+
+//
+// Submit all of the ANR kernels (vertical and horizontal)
+//
+template
+std::vector SubmitANRKernels(queue& q, int cols, int rows,
+ ANRParams params,
+ float* sig_i_lut_data_ptr) {
+ // the internal pipe between the vertical and horizontal kernels
+ using IntraPipeT = DataBundle;
+ using IntraPipe = ext::intel::pipe;
+
+ // static asserts to validate template arguments
+ static_assert(filter_size > 1);
+ static_assert(max_cols > 1);
+ static_assert(pixels_per_cycle > 0);
+ static_assert(IsPow2(pixels_per_cycle));
+ static_assert(max_cols > pixels_per_cycle);
+ static_assert(std::is_integral_v);
+
+ // validate the function arguments
+ int padded_cols = PadColumns(cols);
+ if (cols > max_cols) {
+ std::cerr << "ERROR: cols exceeds the maximum (max_cols) "
+ << "(" << cols << " > " << max_cols << ")\n";
+ std::terminate();
+ } else if (cols <= 0) {
+ std::cerr << "ERROR: cols must be strictly positive\n";
+ std::terminate();
+ } else if (rows <= 0) {
+ std::cerr << "ERROR: rows must be strictly positive\n";
+ std::terminate();
+ } else if ((cols % pixels_per_cycle) != 0) {
+ std::cerr << "ERROR: the number of columns (" << cols
+ << ") must be a multiple of the number of pixels per cycle ("
+ << pixels_per_cycle << ")\n";
+ std::terminate();
+ } else if ((padded_cols % pixels_per_cycle) != 0) {
+ std::cerr << "ERROR: the number of padded columns (" << padded_cols
+ << ") must be a multiple of the number of pixels per cycle ("
+ << pixels_per_cycle << ")\n";
+ std::terminate();
+ } else if (padded_cols >= std::numeric_limits::max()) {
+ // padded_cols >= cols, so just check padded_cols
+ std::cerr << "ERROR: the number of padded columns (" << padded_cols
+ << ") is too big to be counted to by the IndexType (max="
+ << std::numeric_limits::max() << ")\n";
+ std::terminate();
+ } else if (rows >= std::numeric_limits::max()) {
+ std::cerr << "ERROR: the number of rows (" << rows
+ << ") is too big to be counted to by the IndexType (max="
+ << std::numeric_limits::max() << ")\n";
+ std::terminate();
+ }
+
+ // cast the rows and columns to the index type and use these
+ // variables inside the kernel to avoid the device dealing with conversions
+ const IndexT cols_k(cols);
+ const IndexT rows_k(rows);
+
+ // create the spatial filter for the stencil operation
+ constexpr int filter_size_eff = (filter_size + 1) / 2; // ceil(filter_size/2)
+ auto spatial_power = BuildGaussianPowers1D(params.sig_s);
+
+ // Functors or lamdas can be used for the vertical and horizontal kernels.
+ auto vertical_func = VerticalFunctor();
+ auto horizontal_func = HorizontalFunctor();
+
+ // submit the vertical kernel using a column stencil
+ auto vertical_kernel = q.submit([&](handler& h) {
+ h.single_task([=] {
+ // copy host side intensity sigma LUT to the device
+ // For testing the kernel system as an IP and checking the area and Fmax,
+ // we allow the user to turn off connections to device memory. In this case
+ // (the DISABLE_DEVICE_MEM macro IS defined), the results will be incorrect
+ // since there is no way to get the data to/from the device.
+#if defined(IP_MODE)
+ IntensitySigmaLUT sig_i_lut;
+#else
+ IntensitySigmaLUT sig_i_lut(sig_i_lut_data_ptr);
+#endif
+
+ // build the constexpr exp() and inverse LUT ROMs
+ constexpr ExpLUT exp_lut;
+ constexpr InvLUT inv_lut;
+
+ // Start the column stencil.
+ // It will callback to 'vertical_func' with all of the additional
+ // arguments listed after 'vertical_func' (i.e., spatial_power,
+ // params, ...)
+ ColumnStencil(rows_k,
+ cols_k, PixelT(0), vertical_func, spatial_power, params,
+ std::cref(exp_lut), std::cref(inv_lut),
+ std::ref(sig_i_lut));
+ });
+ });
+
+ // submit the horizontal kernel using a row stencil
+ auto horizontal_kernel = q.submit([&](handler& h) {
+ h.single_task([=] {
+ // build the constexpr exp() and inverse LUT ROMs
+ constexpr ExpLUT exp_lut;
+ constexpr InvLUT inv_lut;
+
+#ifdef IP_MODE
+ ANRParams::AlphaFixedT alpha_fixed(0.75);
+ ANRParams::AlphaFixedT one_minus_alpha_fixed(0.25);
+#else
+ // convert the alpha and (1-alpha) values to fixed-point
+ ANRParams::AlphaFixedT alpha_fixed(params.alpha);
+ ANRParams::AlphaFixedT one_minus_alpha_fixed(params.one_minus_alpha);
+#endif
+
+ // Start the row stencil.
+ // It will callback to 'horizontal_func' with the additional all of the
+ // additional arguments listed after 'horizontal_func' (i.e.,
+ // spatial_power, params, alpha_fixed, ...)
+ RowStencil(rows_k, cols_k,
+ DataForwardStruct(0), horizontal_func, spatial_power,
+ params, alpha_fixed, one_minus_alpha_fixed, std::cref(exp_lut),
+ std::cref(inv_lut));
+ });
+ });
+
+ return {vertical_kernel, horizontal_kernel};
+}
+
+#endif /* __ANR_HPP__ */
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr_params.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr_params.hpp
new file mode 100644
index 0000000000..08cf6f5cef
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/anr_params.hpp
@@ -0,0 +1,99 @@
+#ifndef __ANR_PARAMS_HPP__
+#define __ANR_PARAMS_HPP__
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+//
+// A struct to hold the ANR configuration paremeters
+//
+struct ANRParams {
+ // the floating point format
+ using FloatT = float;
+
+ // the alpha blending computation uses fixed point format, these constants
+ // hold the total number of bits and the number of integer bits (the number
+ // of fractional bits is the difference between the two)
+ static constexpr int kAlphaTotalBits = 9;
+ static constexpr int kAlphaIntegerBits = 1;
+
+ // the ac_fixed type for the alpha value
+ using AlphaFixedT = ac_fixed;
+
+ // default constructor
+ ANRParams() {}
+
+ // static method to parse the ANRParams from a file
+ static ANRParams FromFile(std::string filename) {
+ // the return object
+ ANRParams ret;
+
+ // create the file stream to parse
+ std::ifstream is(filename);
+
+ // make sure we opened the file fine
+ if (!is.is_open() || is.fail()) {
+ std::cerr << "ERROR: failed to open " << filename << " for reading\n";
+ std::terminate();
+ }
+
+ // parse the lines
+ std::string line;
+ while (std::getline(is, line)) {
+ size_t colon_pos = line.find(':');
+ auto name = line.substr(0, colon_pos);
+ auto val = std::stod(line.substr(colon_pos + 1));
+
+ if (name == "sig_shot") {
+ ret.sig_shot = val;
+ ret.sig_shot_2 = val * val;
+ } else if (name == "k") {
+ ret.k = val;
+ } else if (name == "sig_i_coeff") {
+ ret.sig_i_coeff = val;
+ } else if (name == "sig_s") {
+ ret.sig_s = val;
+ } else if (name == "alpha") {
+ ret.alpha = val;
+ ret.one_minus_alpha = 1 - val;
+ } else if (name == "filter_size") {
+ ret.filter_size = val;
+ } else if (name == "pixel_bits") {
+ ret.pixel_bits = val;
+ } else {
+ std::cerr << "WARNING: unknown name " << name
+ << " in ANRParams constructor\n";
+ }
+ }
+
+ return ret;
+ }
+
+ int filter_size; // filter size
+ FloatT sig_shot; // shot noise
+ FloatT k; // total gain
+ FloatT sig_i_coeff; // intensity sigma coefficient
+ FloatT sig_s; // spatial sigma
+ FloatT alpha; // alpha value for alpha blending
+ int pixel_bits; // the number of bits for each pixel
+
+ // precomputed values
+ FloatT sig_shot_2; // shot noise squared
+ FloatT one_minus_alpha; // 1 - alpha
+};
+
+// convenience method for printing the ANRParams
+std::ostream& operator<<(std::ostream& os, const ANRParams& params) {
+ os << "sig_shot: " << params.sig_shot << "\n";
+ os << "k: " << params.k << "\n";
+ os << "sig_i_coeff: " << params.sig_i_coeff << "\n";
+ os << "sig_s: " << params.sig_s << "\n";
+ os << "alpha: " << params.alpha << "\n";
+ return os;
+}
+
+#endif /* __ANR_PARAMS_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/column_stencil.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/column_stencil.hpp
new file mode 100644
index 0000000000..5b712b0782
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/column_stencil.hpp
@@ -0,0 +1,212 @@
+#ifndef __COLUMN_STENCIL_HPP__
+#define __COLUMN_STENCIL_HPP__
+
+#include
+#include
+
+#include "data_bundle.hpp"
+#include "mp_math.hpp"
+#include "shift_reg.hpp"
+#include "unrolled_loop.hpp"
+
+using namespace sycl;
+using namespace hldutils;
+
+//
+// Generic 1D column (i.e. vertical) stencil.
+//
+// TEMPLATE PARAMETERS
+// InType: The input pixel type. This is read in by the row stencil
+// through a SYCL pipe. The pipe should be hold
+// 'parallel_cols' elements of this type using the
+// 'DataBundle' type (DataBundle).
+// OutType: The output pixel type. The same logic as the InType above.
+// The data written to the output type is
+// DataBundle
+// IndexT: The datatype used for indexing. This type should have
+// enough bits to count up to the number or rows and columns.
+// InPipe: The input pipe to stream in 'parallel_cols' 'InT' values.
+// OutPipe: The output pipe to stream out 'parallel_cols' 'OutT'
+// values.
+// filter_size: The filter size (i.e., the number of pixels to convolve).
+// max_cols: The maximum number of columns in the image. The runtime
+// argument 'cols' chooses the actual number of columns, and
+// it must be less than or equal to 'max_cols'. Changing
+// 'max_cols' changes the area necessary for the IP, since
+// it sets the size of the FIFOs for the line stores.
+// parallel_cols: The number of columns to compute in parallel.
+// StencilFunction: The stencil callback functor, provided by the user, which
+// is called for every pixel to perform the actual
+// convolution. The function definition should be as follows:
+//
+// OutT MyStencilFunction(int, int, ShiftReg,
+// FunctionArgTypes...)
+//
+// The user can provide extra arguments to the callback by
+// using the FunctionArgTypes parameter pack.
+// FunctionArgTypes: The user-provided type parameter pack of the arguments to
+// pass to the callback function.
+//
+//
+// FUNCTION ARGUMENTS
+// rows: The number of rows in the image.
+// cols: The number of columns in the image.
+// computed by the IP is rows*cols.
+// zero_val: The 'zero' value for the stencil. This is used to pad
+// the columns of the image.
+// func: The user-defined functor. This is a callback that is called
+// to perform the 1D convolution.
+// stencil_args...: The parameter pack of arguments to be passed to the
+// user-defined callback functor.
+//
+template
+void ColumnStencil(IndexT rows, IndexT cols, const InType zero_val,
+ StencilFunction func, FunctionArgTypes... stencil_args) {
+ // types coming into and out of the kernel from pipes, respectively
+ using InPipeT = DataBundle;
+ using OutPipeT = DataBundle;
+
+ // constexpr
+ constexpr int kPaddingPixels = filter_size / 2;
+ constexpr int kShiftRegCols = 1 + parallel_cols - 1;
+ constexpr int kShiftRegRows = filter_size;
+ constexpr int kLineBufferFIFODepth =
+ (max_cols / parallel_cols) + /*filter_size*/ 1;
+ constexpr int kNumLineBuffers = filter_size - 1;
+ constexpr IndexT kColThreshLow = kPaddingPixels;
+ constexpr IndexT kRowThreshLow = kPaddingPixels;
+ constexpr IndexT kRowOutputThreshLow = 2 * kPaddingPixels;
+
+ // static asserts to validate template arguments
+ static_assert(filter_size > 1);
+ static_assert(max_cols > parallel_cols);
+ static_assert(parallel_cols > 0);
+ static_assert(IsPow2(parallel_cols));
+ static_assert(std::is_invocable_r_v,
+ FunctionArgTypes...>);
+
+ // constants
+ const IndexT row_thresh_high = kPaddingPixels + rows;
+ const IndexT padded_rows = rows + 2 * kRowThreshLow;
+ const IndexT fifo_wrap =
+ (cols + /*filter_size*/ 1 - 1 + (parallel_cols - 1 /*round up*/)) /
+ parallel_cols;
+ const IndexT col_loop_bound = (cols / parallel_cols);
+
+ // the 2D shift register to store the 'kShiftRegCols' columns of size
+ // 'kShiftRegRows'
+ ShiftReg2d shifty_2d;
+
+ // the line buffer fifo
+ [[intel::fpga_memory]]
+ InPipeT line_buffer_FIFO[kLineBufferFIFODepth][kNumLineBuffers];
+
+ InPipeT last_new_pixels(zero_val);
+
+ IndexT fifo_idx = 0; // track top of FIFO
+
+ // the main processing loop for the image
+ // NOTE: speculated iterations here will cause a bubble, but
+ // small number relative padded_rows * col_loop_bound and the
+ // increase in Fmax justifies it.
+ [[intel::loop_coalesce(2), intel::initiation_interval(1),
+ intel::ivdep(line_buffer_FIFO)]]
+ for (IndexT row = 0; row < padded_rows; row++) {
+ [[intel::initiation_interval(1), intel::ivdep(line_buffer_FIFO)]]
+ for (IndexT col_loop = 0; col_loop < col_loop_bound; col_loop++) {
+ // the base column index for this iteration
+ IndexT col = col_loop * parallel_cols;
+
+ // read in values if it is time to start reading
+ // (row >= kRowThreshLow) and if there are still more to read
+ // (row < row_thresh_high)
+ InPipeT new_pixels(zero_val);
+ if ((row >= kRowThreshLow) && (row < row_thresh_high)) {
+ new_pixels = InPipe::read();
+ }
+
+ InPipeT input_val(last_new_pixels);
+ constexpr auto kInputShiftVals =
+ Min(kColThreshLow, (IndexT)parallel_cols);
+ input_val.template ShiftMultiVals(
+ new_pixels);
+
+ [[intel::fpga_register]]
+ InPipeT pixel_column[filter_size];
+
+ // load from FIFO to shift register
+ //
+ // ┌───────────
+ // ┌───┬───┬───┐ ┌───┤ FIFO
+ // │ ◄─ ◄─ ◄─┘ └───────────
+ // ├───┼───┼───┤ ┌───────────
+ // │ ◄─ ◄─ ◄─────┤ FIFO
+ // ├───┼───┼───┤ └───────────
+ // │ ◄─ ◄─ ◄─────────────────Input
+ // └───┴───┴───┘
+
+ UnrolledLoop<0, filter_size>([&](auto stencil_row) {
+ if constexpr (stencil_row != (filter_size - 1)) {
+ pixel_column[stencil_row] = line_buffer_FIFO[fifo_idx][stencil_row];
+ } else {
+ pixel_column[stencil_row] = input_val;
+ }
+ });
+ shifty_2d.template ShiftCols(pixel_column);
+
+ // Continue processing through FIFOs
+ // ┌─────────────┐
+ // │ FIFO ◄───┐
+ // └─────────────┘ │
+ // ┌───────────────────┘
+ // │ ┌─────────────┐
+ // └─┤ FIFO ◄───┐
+ // └─────────────┘ │
+ // └─Input
+
+ UnrolledLoop<0, (filter_size - 1)>([&](auto fifo_row) {
+ if constexpr (fifo_row != (filter_size - 2)) {
+ line_buffer_FIFO[fifo_idx][fifo_row] = pixel_column[fifo_row + 1];
+ } else {
+ line_buffer_FIFO[fifo_idx][(filter_size - 2)] = input_val;
+ }
+ });
+
+ // Perform the convolution on the 1D window
+ OutPipeT out_data((OutType)0);
+ UnrolledLoop<0, parallel_cols>([&](auto stencil_idx) {
+ ShiftReg shifty_copy;
+
+ int col_local = col + stencil_idx;
+
+ UnrolledLoop<0, filter_size>([&](auto stencil_row) {
+ shifty_copy[stencil_row] = shifty_2d[stencil_row][stencil_idx];
+ });
+
+ // pass a copy of the line buffer's register window.
+ out_data[stencil_idx] = func((row - kRowOutputThreshLow), col_local,
+ shifty_copy, stencil_args...);
+ });
+
+ // write the output data if it is in range (i.e., it is a real pixel
+ // and not part of the padding)
+ if (row >= kRowOutputThreshLow) {
+ OutPipe::write(out_data);
+ }
+
+ // increment the fifo read/write index
+ if (fifo_idx == (fifo_wrap - 1)) {
+ fifo_idx = 0;
+ } else {
+ fifo_idx++;
+ }
+ last_new_pixels = new_pixels;
+ }
+ }
+}
+
+#endif /* __COLUMN_STENCIL_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/constants.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/constants.hpp
new file mode 100755
index 0000000000..2729c592ab
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/constants.hpp
@@ -0,0 +1,87 @@
+#ifndef __CONSTANTS_HPP__
+#define __CONSTANTS_HPP__
+
+#include
+#include "mp_math.hpp"
+
+// The size of the filter can be changed at the command line
+#ifndef FILTER_SIZE
+#define FILTER_SIZE 9
+#endif
+constexpr unsigned kFilterSize = FILTER_SIZE;
+static_assert(kFilterSize > 1);
+
+// The number of pixels per cycle
+#ifndef PIXELS_PER_CYCLE
+#define PIXELS_PER_CYCLE 1
+#endif
+constexpr unsigned kPixelsPerCycle = PIXELS_PER_CYCLE;
+static_assert(kPixelsPerCycle > 0);
+static_assert(IsPow2(kPixelsPerCycle) > 0);
+
+// The maximum number of columns in the image
+#ifndef MAX_COLS
+#define MAX_COLS 1920 // HD
+//#define MAX_COLS 3840 // 4K
+//#define MAX_COLS 2048
+#endif
+constexpr unsigned kMaxCols = MAX_COLS;
+static_assert(kMaxCols > 0);
+static_assert(kMaxCols > kPixelsPerCycle);
+
+// The maximum number of rows in the image
+#ifndef MAX_ROWS
+#define MAX_ROWS MAX_COLS
+#endif
+constexpr unsigned kMaxRows = MAX_ROWS;
+static_assert(kMaxRows > 0);
+
+// pick the indexing variable size based on kMaxCols and kMaxRows
+constexpr unsigned kSmallIndexTBits =
+ Max(CeilLog2(kMaxCols), CeilLog2(kMaxRows));
+using SmallIndexT = ac_int;
+
+// add max() function to std::numeric_limits for IndexT
+namespace std {
+ template<> class numeric_limits {
+ public:
+ static constexpr int max() { return (1 << kSmallIndexTBits) - 1; };
+ static constexpr int min() { return 0; };
+ };
+};
+
+// the type used for indexing the rows and columns of the image
+using IndexT = short;
+static_assert(std::is_integral_v);
+static_assert(!std::is_unsigned_v);
+
+// the number of bits used for the pixel
+#ifndef PIXEL_BITS
+#define PIXEL_BITS 8
+#endif
+constexpr unsigned kPixelBits = PIXEL_BITS;
+static_assert(kPixelBits > 0);
+
+// the type to use for the pixel intensity values and a temporary type
+// which should have more bits than the pixel type to check for overflow.
+// We will use subtraction on the temporary type, so it must be signed.
+using PixelT = ac_int; // 'kPixelBits' bits, unsigned
+using TmpT = long long; // 64 bits, signed
+constexpr int kPixelRange = (1 << kPixelBits);
+static_assert(std::is_signed_v);
+static_assert((sizeof(TmpT) * 8) > kPixelBits);
+
+// add min() and max() functions to std::numeric_limits for PixelT
+namespace std {
+ template<> class numeric_limits {
+ public:
+ static constexpr int max() { return (1 << kPixelBits) - 1; };
+ static constexpr int min() { return 0; };
+ };
+};
+
+// PSRN default threshold
+// https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
+constexpr double kPSNRDefaultThreshold = 30.0;
+
+#endif /* __CONSTANTS_HPP__ */
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/data_bundle.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/data_bundle.hpp
new file mode 100644
index 0000000000..90cbf283af
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/data_bundle.hpp
@@ -0,0 +1,101 @@
+#ifndef __DATA_BUNDLE_HPP__
+#define __DATA_BUNDLE_HPP__
+
+namespace hldutils {
+
+//
+// A class used to group together 'bundle_size' elements of type 'T' into a
+// struct. Similar to an array but with the copyer constructor and operator=()
+// overridden to avoid expensive copys.
+//
+template
+struct DataBundle {
+ T data_[bundle_size];
+
+ DataBundle() {}
+
+ DataBundle(const T op) {
+#pragma unroll
+ for (int idx = 0; idx < bundle_size; idx++) {
+ data_[idx] = op;
+ }
+ }
+
+ DataBundle(const DataBundle &op) {
+#pragma unroll
+ for (int idx = 0; idx < bundle_size; idx++) {
+ data_[idx] = op.data_[idx];
+ }
+ }
+
+ DataBundle &operator=(const DataBundle &op) {
+#pragma unroll
+ for (int idx = 0; idx < bundle_size; idx++) {
+ data_[idx] = op.data_[idx];
+ }
+ return *this;
+ }
+
+ bool operator==(const DataBundle &rhs) {
+ bool is_equal = true;
+#pragma unroll
+ for (int b = 0; b < bundle_size; b++) {
+ is_equal &= (data_[b] == rhs.data_[b]);
+ }
+
+ return is_equal;
+ }
+
+ // get a specific value in the bundle
+ T &operator[](int i) { return data_[i]; }
+
+ // get a raw pointer to underlying data
+ T *Data() { return &data_[0]; }
+
+ // For a shift register with N columns, the first piece of data is inserted in
+ // index [N-1], and is read out of index [0].
+ //
+ // ```
+ // i=0 1 2
+ // ┌───┬───┬───┐
+ // out ◄─ │ r ◄─e ◄─g ◄─ input
+ // └───┴───┴───┘
+ // ```
+ void Shift(T &in) {
+#pragma unroll
+ for (int i = 0; i < (bundle_size - 1); i++) {
+ data_[i] = data_[i + 1];
+ }
+ data_[bundle_size - 1] = in;
+ }
+
+ template
+ void ShiftSingleVal(T &in) {
+#pragma unroll
+ for (int i = 0; i < (bundle_size - shift_amt); i++) {
+ data_[i] = data_[i + shift_amt];
+ }
+
+#pragma unroll
+ for (int i = 0; i < (shift_amt); i++) {
+ data_[(bundle_size - shift_amt) + i] = in;
+ }
+ }
+
+ template
+ void ShiftMultiVals(DataBundle &in) {
+#pragma unroll
+ for (int i = 0; i < (bundle_size - shift_amt); i++) {
+ data_[i] = data_[i + shift_amt];
+ }
+
+#pragma unroll
+ for (int i = 0; i < (shift_amt); i++) {
+ data_[(bundle_size - shift_amt) + i] = in[i];
+ }
+ }
+};
+
+} // namespace hldutils
+
+#endif /* __DATA_BUNDLE_HPP__ */
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/dma_kernels.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/dma_kernels.hpp
new file mode 100644
index 0000000000..38cb16f6f6
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/dma_kernels.hpp
@@ -0,0 +1,101 @@
+#ifndef __DMA_KERNELS_HPP__
+#define __DMA_KERNELS_HPP__
+
+//
+// This file contains the kernels for reading from device memory and
+// streaming into the ANR input pipe, as well as the kernels for reading from
+// the ANR output pipe and writing to device memory.
+//
+
+#include
+#include
+
+#include "data_bundle.hpp"
+
+using namespace sycl;
+using namespace hldutils;
+
+//
+// Kernel to read data from device memory and write it into the ANR input pipe.
+//
+template
+event SubmitInputDMA(queue &q, T *in_ptr, int rows, int cols, int frames) {
+ using PipeType = DataBundle;
+
+ // LSU attribute to turn off caching
+ using NonCachingLSU =
+ ext::intel::lsu, ext::intel::cache<0>,
+ ext::intel::statically_coalesce,
+ ext::intel::prefetch>;
+
+ // validate the number of columns
+ if ((cols % pixels_per_cycle) != 0) {
+ std::cerr << "ERROR: the number of columns is not a multiple of the pixels "
+ << "per cycle\n";
+ std::terminate();
+ }
+
+ // the number of iterations is the number of total pixels (rows*cols)
+ // divided by the number of pixels per cycle
+ const int iterations = cols * rows / pixels_per_cycle;
+
+ // Using device memory
+ return q.submit([&](handler &h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ device_ptr in(in_ptr);
+
+ // coalesce the following two loops into a single for-loop using the
+ // loop_coalesce attribute
+ [[intel::loop_coalesce(2)]]
+ for (int f = 0; f < frames; f++) {
+ for (int i = 0; i < iterations; i++) {
+ PipeType pipe_data;
+ #pragma unroll
+ for (int k = 0; k < pixels_per_cycle; k++) {
+ pipe_data[k] = NonCachingLSU::load(in + i * pixels_per_cycle + k);
+ }
+ Pipe::write(pipe_data);
+ }
+ }
+ });
+ });
+}
+
+//
+// Kernel to pull data out of the ANR output pipe and writes to device memory.
+//
+template
+event SubmitOutputDMA(queue &q, T *out_ptr, int rows, int cols, int frames) {
+ // validate the number of columns
+ if ((cols % pixels_per_cycle) != 0) {
+ std::cerr << "ERROR: the number of columns is not a multiple of the pixels "
+ << "per cycle\n";
+ std::terminate();
+ }
+
+ // the number of iterations is the number of total pixels (rows*cols)
+ // divided by the number of pixels per cycle
+ const int iterations = cols * rows / pixels_per_cycle;
+
+ // Using device memory
+ return q.submit([&](handler &h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ device_ptr out(out_ptr);
+
+ // coalesce the following two loops into a single for-loop using the
+ // loop_coalesce attribute
+ [[intel::loop_coalesce(2)]]
+ for (int f = 0; f < frames; f++) {
+ for (int i = 0; i < iterations; i++) {
+ auto pipe_data = Pipe::read();
+ #pragma unroll
+ for (int k = 0; k < pixels_per_cycle; k++) {
+ out[i * pixels_per_cycle + k] = pipe_data[k];
+ }
+ }
+ }
+ });
+ });
+}
+
+#endif /* __DMA_KERNELS_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/intensity_sigma_lut.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/intensity_sigma_lut.hpp
new file mode 100644
index 0000000000..0057ceb5aa
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/intensity_sigma_lut.hpp
@@ -0,0 +1,64 @@
+#ifndef __INTENSITY_SIGMA_LUT_HPP__
+#define __INTENSITY_SIGMA_LUT_HPP__
+
+#include
+#include
+#include
+
+#include "anr_params.hpp"
+#include "constants.hpp"
+
+//
+// A LUT for computing the intensity sigma value of a pixel
+//
+class IntensitySigmaLUT {
+ public:
+ // default constructor
+ IntensitySigmaLUT() {}
+
+ // construct from a device_ptr (for constructing from device memory)
+ IntensitySigmaLUT(device_ptr ptr) {
+ // use a pipelined LSU to load from device memory since we don't
+ // care about the performance of the copy.
+ using PipelinedLSU = ext::intel::lsu<>;
+ for (int i = 0; i < lut_depth; i++) {
+ data_[i] = PipelinedLSU::load(ptr + i);
+ }
+ }
+
+ // construct from the ANR parameters (actually builds the LUT)
+ IntensitySigmaLUT(ANRParams params) {
+ for (int i = 0; i < lut_depth; i++) {
+ float sig_i = sycl::sqrt(params.k * float(i) + params.sig_shot_2) *
+ params.sig_i_coeff;
+ float sig_i_inv = 1.0f / sig_i;
+ float sig_i_inv_squared = sig_i_inv * sig_i_inv;
+ float sig_i_inv_squared_2 = 0.5f * sig_i_inv_squared;
+ data_[i] = sig_i_inv_squared_2; // storing 0.5 * (1/sig_i)^2
+ }
+ }
+
+ // helper static method to allocate enough memory to hold the LUT
+ static float* AllocateDevice(sycl::queue& q) {
+ float* ptr = sycl::malloc_device(lut_depth, q);
+ if (ptr == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'ptr'\n";
+ std::terminate();
+ }
+ return ptr;
+ }
+
+ // helper method to copy the data to the device
+ sycl::event CopyDataToDevice(sycl::queue& q, float* ptr) {
+ return q.memcpy(ptr, data_, lut_depth * sizeof(float));
+ }
+
+ const float& operator[](int i) const { return data_[i]; }
+
+ private:
+ static constexpr int lut_depth = std::numeric_limits::max() -
+ std::numeric_limits::min() + 1;
+ float data_[lut_depth];
+};
+
+#endif /* __INTENSITY_SIGMA_LUT_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/main.cpp
new file mode 100644
index 0000000000..09fa160f20
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/main.cpp
@@ -0,0 +1,428 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "anr.hpp"
+#include "anr_params.hpp"
+#include "constants.hpp"
+#include "data_bundle.hpp"
+#include "dma_kernels.hpp"
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities/include/dpc_common.hpp
+#include "dpc_common.hpp"
+#include "mp_math.hpp"
+
+using namespace sycl;
+using namespace std::chrono;
+
+////////////////////////////////////////////////////////////////////////////////
+// Forward declare functions used in this file by main()
+void ParseFiles(std::string data_dir, std::vector& in_pixels,
+ std::vector& ref_pixels, int& cols, int& rows,
+ ANRParams& params);
+
+void WriteOutputFile(std::string data_dir, std::vector& pixels,
+ int cols, int rows);
+
+double RunANR(queue& q, PixelT* in_ptr, PixelT* out_ptr, int cols, int rows,
+ int frames, ANRParams params, float* sig_i_lut_data_ptr);
+
+bool Validate(PixelT* val, PixelT* ref, int rows, int cols,
+ double psnr_thresh = kPSNRDefaultThreshold);
+////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char* argv[]) {
+ /////////////////////////////////////////////////////////////
+ // reading and validating the command line arguments
+ std::string data_dir = "../test_data";
+ bool passed = true;
+#ifdef FPGA_EMULATOR
+ int runs = 2;
+ int frames = 2;
+#else
+ int runs = 2;
+ int frames = 8;
+#endif
+
+ // get the input data directory
+ if (argc > 1) {
+ data_dir = std::string(argv[1]);
+ }
+
+ // get the number of runs as the second command line argument
+ if (argc > 2) {
+ runs = atoi(argv[2]);
+ }
+
+ // get the number of frames as the third command line argument
+ if (argc > 3) {
+ frames = atoi(argv[3]);
+ }
+
+ // enforce at least two runs
+ if (runs < 2) {
+ std::cerr << "ERROR: 'runs' must be 2 or more\n";
+ std::terminate();
+ }
+
+ // enforce at least one batch
+ if (frames < 1) {
+ std::cerr << "ERROR: 'frames' must be atleast 1\n";
+ std::terminate();
+ }
+ /////////////////////////////////////////////////////////////
+
+ // the device selector
+#ifdef FPGA_EMULATOR
+ ext::intel::fpga_emulator_selector selector;
+#else
+ ext::intel::fpga_selector selector;
+#endif
+
+ // create the device queue
+ queue q(selector, dpc_common::exception_handler);
+
+ // make sure the device supports USM device allocations
+ auto d = q.get_device();
+ if (!d.get_info()) {
+ std::cerr << "ERROR: The selected device does not support USM device"
+ << " allocations\n";
+ std::terminate();
+ }
+
+ // parse the input files
+ int cols, rows, pixel_count;
+ ANRParams params;
+ std::vector in_pixels, ref_pixels;
+ ParseFiles(data_dir, in_pixels, ref_pixels, cols, rows, params);
+ pixel_count = cols * rows;
+
+ // create the output pixels (initialize to all 0s)
+ std::vector out_pixels(in_pixels.size(), 0);
+
+ // allocate memory on the device for the input and output
+ PixelT *in, *out;
+ if ((in = malloc_device(pixel_count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'in'\n";
+ std::terminate();
+ }
+ if ((out = malloc_device(pixel_count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'out'\n";
+ std::terminate();
+ }
+
+ // copy the input data to the device memory and wait for the copy to finish
+ q.memcpy(in, in_pixels.data(), pixel_count * sizeof(PixelT)).wait();
+
+ // allocate space for the intensity sigma LUT
+ float* sig_i_lut_data_ptr = IntensitySigmaLUT::AllocateDevice(q);
+
+ // create the intensity sigma LUT data locally on the host
+ IntensitySigmaLUT sig_i_lut_host(params);
+
+ // copy the intensity sigma LUT to the device
+ sig_i_lut_host.CopyDataToDevice(q, sig_i_lut_data_ptr).wait();
+ //////////////////////////////////////////////////////////////////////////////
+
+ // track timing information in ms
+ std::vector time(runs);
+
+ // print out some info
+ std::cout << "Runs: " << runs << "\n";
+ std::cout << "Columns: " << cols << "\n";
+ std::cout << "Rows: " << rows << "\n";
+ std::cout << "Frames: " << frames << "\n";
+ std::cout << "Filter Size: " << kFilterSize << "\n";
+ std::cout << "Pixels Per Cycle: " << kPixelsPerCycle << "\n";
+ std::cout << "Maximum Columns: " << kMaxCols << "\n";
+ std::cout << "\n";
+
+ try {
+ // run the design multiple times to increase the accuracy of the timing
+ for (int i = 0; i < runs; i++) {
+ // run ANR
+ time[i] =
+ RunANR(q, in, out, cols, rows, frames, params, sig_i_lut_data_ptr);
+
+ // Copy the output back from the device
+ q.memcpy(out_pixels.data(), out, pixel_count * sizeof(PixelT)).wait();
+
+ // validate the output on the last iteration
+ if (i == (runs-1)) {
+ passed &= Validate(out_pixels.data(), ref_pixels.data(), rows, cols);
+ } else {
+ passed &= true;
+ }
+ }
+ } catch (exception const& e) {
+ std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+ std::terminate();
+ }
+
+ // free the allocated device memory
+ sycl::free(in, q);
+ sycl::free(out, q);
+ sycl::free(sig_i_lut_data_ptr, q);
+
+ // write the output files if device memory was used (output is meaningless
+ // otherwise)
+ WriteOutputFile(data_dir, out_pixels, cols, rows);
+
+ // print the performance results
+ if (passed) {
+ // NOTE: when run in emulation, these results do not accurately represent
+ // the performance of the kernels in actual FPGA hardware
+ double avg_time_ms =
+ std::accumulate(time.begin() + 1, time.end(), 0.0) / (runs - 1);
+
+ size_t input_count_mega = pixel_count * frames * sizeof(PixelT) * 1e-6;
+
+ std::cout << "Execution time: " << avg_time_ms << " ms\n";
+ std::cout << "Throughput: " << (input_count_mega / (avg_time_ms * 1e-3))
+ << " MB/s\n";
+ std::cout << "PASSED\n";
+ return 0;
+ } else {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+}
+
+// declare kernel and pipe names globally to reduce name mangling
+class ANRInPipeID;
+class ANROutPipeID;
+class InputKernelID;
+class OutputKernelID;
+
+//
+// Run the ANR algorithm on the device
+//
+double RunANR(queue& q, PixelT* in_ptr, PixelT* out_ptr, int cols, int rows,
+ int frames, ANRParams params, float* sig_i_lut_data_ptr) {
+ // the input and output pipe for the sorter
+ using PipeType = DataBundle;
+ using ANRInPipe = sycl::ext::intel::pipe;
+ using ANROutPipe = sycl::ext::intel::pipe;
+
+ // launch the input and output kernels that read from and write to the device
+ auto input_kernel_event =
+ SubmitInputDMA(q,
+ in_ptr, rows, cols, frames);
+
+ auto output_kernel_event =
+ SubmitOutputDMA(q,
+ out_ptr, rows, cols, frames);
+
+ // launch all ANR kernels
+ std::vector> anr_kernel_events(frames);
+ for (int i = 0; i < frames; i++) {
+ anr_kernel_events[i] =
+ SubmitANRKernels(q, cols, rows, params,
+ sig_i_lut_data_ptr);
+ }
+
+ // wait for the input and output kernels to finish
+ auto start = high_resolution_clock::now();
+ input_kernel_event.wait();
+ output_kernel_event.wait();
+ auto end = high_resolution_clock::now();
+
+ // wait for the ANR kernels to finish
+ for (auto& one_event_set : anr_kernel_events) {
+ for (auto& e : one_event_set) {
+ e.wait();
+ }
+ }
+
+ // return the duration in milliseconds, excluding memory transfers
+ duration diff = end - start;
+ return diff.count();
+}
+
+//
+// Helper to parse pixel data files
+//
+void ParseDataFile(std::string filename, std::vector& pixels, int& cols,
+ int& rows) {
+ // create the file stream to parse
+ std::ifstream ifs(filename);
+
+ // make sure we opened the file
+ if (!ifs.is_open() || ifs.fail()) {
+ std::cerr << "ERROR: failed to open " << filename << " for reading\n";
+ std::terminate();
+ }
+
+ // get the header and data
+ std::string header_str, data_str;
+ if (!std::getline(ifs, header_str)) {
+ std::cerr << "ERROR: failed to get header line from " << filename << "\n";
+ std::terminate();
+ }
+ if (!std::getline(ifs, data_str)) {
+ std::cerr << "ERROR: failed to get data line from " << filename << "\n";
+ std::terminate();
+ }
+
+ // first two elements are the image dimensions
+ std::stringstream header_ss(header_str);
+ header_ss >> rows >> cols;
+
+ // expecting to parse cols*rows pixels from the 'data_str' line
+ pixels.resize(cols * rows);
+
+ // parse all of the pixels
+ std::stringstream data_ss(data_str);
+ for (int i = 0; i < cols * rows; i++) {
+ // parse using 64 bit integer
+ TmpT x;
+
+ // parse the pixel value
+ if (!(data_ss >> x)) {
+ std::cerr << "ERROR: ran out of pixels when parsing " << filename << "\n";
+ std::terminate();
+ }
+
+ // check for parsing failure
+ if (data_ss.fail()) {
+ std::cerr << "ERROR: failed to parse pixel in " << filename << "\n";
+ std::terminate();
+ }
+
+ // check if the parsed value fits in the pixel type
+ if (x > static_cast(std::numeric_limits::max())) {
+ std::cerr << "ERROR: value (" << x
+ << ") is too big to store in pixel type 'T'\n";
+ std::terminate();
+ }
+ if (x < static_cast(std::numeric_limits::min())) {
+ std::cerr << "ERROR: value (" << x
+ << ") is too small to store in pixel type 'T'\n";
+ std::terminate();
+ }
+
+ // set the value
+ pixels[i] = static_cast(x);
+ }
+}
+
+//
+// Function that parses all of the input files
+//
+void ParseFiles(std::string data_dir, std::vector& in_pixels,
+ std::vector& ref_pixels, int& cols, int& rows,
+ ANRParams& params) {
+ // parse the pixel data files
+ int noisy_w, noisy_h;
+ ParseDataFile(data_dir + "/input_noisy.data", in_pixels, noisy_w, noisy_h);
+ int ref_w, ref_h;
+ ParseDataFile(data_dir + "/output_ref.data", ref_pixels, ref_w, ref_h);
+
+ // ensure the dimensions match
+ if (noisy_w != ref_w) {
+ std::cerr << "noisy input and reference widths do not match " << noisy_w
+ << " != " << ref_w << "\n";
+ std::terminate();
+ }
+ if (noisy_h != ref_h) {
+ std::cerr << "noisy input and reference heights do not match " << noisy_h
+ << " != " << ref_h << "\n";
+ std::terminate();
+ }
+
+ // set the width and height
+ cols = ref_w;
+ rows = ref_h;
+
+ // parse the ANR config parameters file
+ params = ANRParams::FromFile(data_dir + "/param_config.data");
+
+ // ensure the parsed filter size matches the compile time constant
+ if (params.filter_size != kFilterSize) {
+ std::cerr << "ERROR: the filter size parsed from " << data_dir
+ << "/param_config.data (" << params.filter_size
+ << ") does not match the compile time constant filter size "
+ << "(kFilterSize = " << kFilterSize << ")\n";
+ std::terminate();
+ }
+
+ // ensure the parsed number of pixel bits matches the compile time constant
+ if (params.pixel_bits != kPixelBits) {
+ std::cerr << "ERROR: the number of bits per pixel parsed from " << data_dir
+ << "/param_config.data (" << params.pixel_bits
+ << ") does not match the compile time constant pixel size "
+ << "kPixelBits = " << kPixelBits << ")\n";
+ std::terminate();
+ }
+}
+
+//
+// Function to write the output to a file
+//
+void WriteOutputFile(std::string data_dir, std::vector& pixels,
+ int cols, int rows) {
+ std::string filename = data_dir + "/output.data";
+ std::ofstream ofs(filename);
+
+ // make sure we opened the file fine
+ if (!ofs.is_open() || ofs.fail()) {
+ std::cerr << "ERROR: failed to open " << filename << " for writing\n";
+ std::terminate();
+ }
+
+ // write the image dimensions
+ ofs << rows << " " << cols << "\n";
+
+ // write the pixels
+ for (auto& p : pixels) {
+ ofs << static_cast(p) << " ";
+ }
+}
+
+//
+// Validate the output pixels using Peak signal-to-noise ratio (PSNR)
+// https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
+//
+// Also check the max individual pixel difference.
+//
+bool Validate(PixelT* val, PixelT* ref, int rows, int cols,
+ double psnr_thresh) {
+ // get the maximum value of the pixel
+ constexpr double max_i = std::numeric_limits::max();
+
+ // total number of pixels to check
+ int count = rows * cols;
+
+ // compute the MSE by summing the squared differences
+ // also find the maximum difference between the output pixel and the reference
+ double mse = 0.0;
+ for (int i = 0; i < count; i++) {
+ // cast to a double here because we are subtracting
+ auto diff = double(val[i]) - double(ref[i]);
+ mse += diff * diff;
+ }
+ mse /= count;
+
+ // compute the PSNR
+ double psnr = (20 * std::log10(max_i)) - (10 * std::log10(mse));
+
+ // check PSNR and maximum pixel difference
+ bool passed = true;
+ if (psnr <= psnr_thresh) {
+ std::cerr << "ERROR: Peak signal-to-noise ratio (PSNR) is too low: " << psnr
+ << "\n";
+ passed = false;
+ }
+
+ return passed;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/mp_math.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/mp_math.hpp
new file mode 100644
index 0000000000..78fe47bc06
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/mp_math.hpp
@@ -0,0 +1,161 @@
+#ifndef __MP_MATH__
+#define __MP_MATH__
+
+//
+// This file contains various helper C++ metaprogramming math functions that
+// are useful across various designs.
+//
+
+#include
+
+namespace hldutils {
+
+// returns the absolute value of 'x'
+template
+constexpr T Abs(T x) { return (x < 0) ? -x : x; }
+
+// returns the minimum of 'a' and 'b'.
+// The type, 'T', must have an operator<
+template
+constexpr T Min(T a, T b) { return (a < b) ? a : b; }
+
+// returns the maximum of 'a' and 'b'.
+// The type, 'T', must have an operator>
+template
+constexpr T Max(T a, T b) { return (a > b) ? a : b; }
+
+// returns n^2
+template
+constexpr T Pow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ return T(1) << n;
+}
+
+// returns whether 'n' is a power of 2
+template
+constexpr bool IsPow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ return (n != 0) && ((n & (n - 1)) == 0);
+}
+
+// returns log2(n) rounding down
+template
+constexpr T Log2(T n) {
+ static_assert(std::is_integral_v);
+ if (n < 2) {
+ return T(0);
+ } else {
+ T ret = 0;
+ while (n >= 2) {
+ ret++;
+ n /= 2;
+ }
+ return ret;
+ }
+}
+
+// returns log(2) rounded up
+template
+static constexpr T CeilLog2(T n) {
+ return ((n == 1) ? T(0) : Log2(n - 1) + T(1));
+}
+
+// return 'n' rounded up to the nearest power of 2
+template
+constexpr T RoundUpPow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ if (n == 0) {
+ return 2;
+ } else if (IsPow2(n)) {
+ return n;
+ } else {
+ return T(1) << (Log2(n) + 1);
+ }
+}
+
+// computes x^y where y must be an integer (positive or negative)
+constexpr double Pow(double x, int y) {
+ if (y == 0) {
+ // x^0 = 1
+ return 1.0;
+ } else {
+ // handle both y < 0 and y > 0 by changing loop bound and multiply value
+ bool y_is_negative = (y < 0);
+ double mult_val = y_is_negative ? (1/x) : x;
+ int loop_bound = y_is_negative ? -y : y;
+
+ double ret = 1.0;
+ for (int i = 0; i < loop_bound; i++) {
+ ret *= mult_val;
+ }
+ return ret;
+ }
+}
+
+// estimates e^(x) for x >= 0 using a taylor series expansion
+// https://en.wikipedia.org/wiki/Taylor_series
+constexpr double Exp(double x, unsigned taylor_terms=32) {
+ double factorial = 1.0;
+ double power = 1.0;
+ double answer = 1.0;
+
+ for(int i = 1; i < taylor_terms-1; i++) {
+ power *= x;
+ factorial *= i;
+ answer += power / factorial;
+ }
+ return answer;
+}
+
+// Scale significand using floating-point base exponent
+// see: http://www.cplusplus.com/reference/cmath/scalbn/
+constexpr float Scalbn(float value, int exponent) {
+ if (exponent == 0) {
+ return value;
+ } else {
+ float ret = value;
+ while(exponent != 0) {
+ if (exponent > 0) {
+ ret *= 2;
+ exponent--;
+ } else {
+ ret /= 2;
+ exponent++;
+ }
+ }
+ return ret;
+ }
+}
+
+// extract the exponent from a 32-bit float
+constexpr int FP32ExtractExponent(float x) {
+ if (x == 0) {
+ return 0;
+ } else {
+ float ret = 0;
+ float abs_x = Abs(x);
+ while (abs_x >= 2 || abs_x < 1) {
+ bool abs_x_gte_2 = (abs_x >= 2);
+ ret += (abs_x_gte_2 ? 1 : -1);
+ x = (abs_x_gte_2 ? (x/2) : (x*2));
+ abs_x = Abs(x);
+ }
+ return ret;
+ }
+}
+
+// extract the mantissa from a 32-bit float
+constexpr int FP32ExtractMantissa(float x) {
+ // remove hidden 1 and bias the exponent to get integer
+ //#pragma clang fp contract(off)
+ //return (Abs(x) < std::numeric_limits::infinity()) ?
+ // Scalbn(Scalbn(Abs(x),-FP32ExtractExponent(x))-1,23) : 0;
+ return Scalbn(Scalbn(Abs(x),-FP32ExtractExponent(x))-1,23);
+}
+
+} // namespace hldutils
+
+#endif /* __MP_MATH__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp.hpp
new file mode 100644
index 0000000000..9d6a35b7fa
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp.hpp
@@ -0,0 +1,206 @@
+#ifndef __QFP_HPP__
+#define __QFP_HPP__
+
+#include
+#include
+
+#include
+
+#include "mp_math.hpp"
+
+//
+// A static class that is used to convert to/from 32-bit floating point
+// and quantized floating point (QFP) format. Note that, when converting from
+// FP32 to QFP, we truncate the mantissa, instead of rounding. This reduces
+// the area required for the conversion at the expense of decreased accuracy.
+//
+template
+struct QFP {
+ QFP(const QFP&) = delete;
+ QFP& operator=(const QFP&) = delete;
+
+ // determine if the QFP can fit into a unsigned char or unsigned short (if
+ // not, default to an unsigned int)
+ static constexpr bool fits_in_uchar =
+ qfp_total_bits <= sizeof(unsigned char)*8;
+ static constexpr bool fits_in_ushort =
+ qfp_total_bits <= sizeof(unsigned short)*8;
+
+ using qfp_type =
+ std::conditional_t>;
+
+ // 32-bit floating point bits based on
+ // https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+ static constexpr unsigned kFP32SignBits = 1;
+ static constexpr unsigned kFP32ExponentBits = 8;
+ static constexpr unsigned kFP32MantissaBits = 23;
+ static constexpr unsigned kFP32TotalBits =
+ kFP32SignBits + kFP32ExponentBits + kFP32MantissaBits;
+ static constexpr int kFP32ExponentOffset =
+ (1 << (kFP32ExponentBits-1)) - 1;
+ static constexpr unsigned kFP32ExponentMask = (1 << kFP32ExponentBits) - 1;
+ static constexpr unsigned kFP32MantissaMask = (1 << kFP32MantissaBits) - 1;
+
+ // A union for accesing the mantissa, exponent, and sign bits
+ typedef union {
+ float f;
+ struct {
+ unsigned mantissa : kFP32MantissaBits;
+ unsigned exponent : kFP32ExponentBits;
+ unsigned sign : kFP32SignBits;
+ } parts;
+ } FloatCast;
+
+ // the number of mantissa bits for the QFP
+ static constexpr unsigned qfp_mantissa_bits =
+ qfp_total_bits - qfp_exponent_bits - is_signed;
+
+ // masks for the exponent and mantissa of the QFP
+ static constexpr unsigned qfp_mask = (1 << qfp_total_bits) - 1;
+ static constexpr unsigned qfp_exponent_mask = (1 << qfp_exponent_bits) - 1;
+ static constexpr unsigned qfp_mantissa_mask = (1 << qfp_mantissa_bits) - 1;
+ static constexpr int qfp_exponent_offset =
+ (1 << (qfp_exponent_bits - 1)) - 1;
+
+ // the difference in bits between the QFP and the 32-bit float
+ static constexpr unsigned mantissa_bit_diff =
+ kFP32MantissaBits - qfp_mantissa_bits;
+
+ // static asserts
+ static_assert(kFP32TotalBits == (sizeof(float) * 8));
+ static_assert(qfp_mantissa_bits <= kFP32MantissaBits);
+ static_assert(qfp_exponent_bits > 0);
+ static_assert(qfp_mantissa_bits > 0);
+ static_assert(qfp_total_bits > qfp_exponent_bits);
+
+ //
+ // convert from a 32-bit float to a QFP
+ //
+ static qfp_type FromFP32(float f) {
+ // use the float cast to get the parts of the FP32
+ FloatCast f_casted = {.f = f};
+ int fp32_sign = f_casted.parts.sign;
+ ac_int fp32_exponent =
+ f_casted.parts.exponent;
+ ac_int fp32_mantissa = f_casted.parts.mantissa;
+
+ // get the most significant qfp_mantissa_bits from the float's mantissa
+ // NOTE: we are doing truncation here without rounding, which will further
+ // reduce accuracy but require less area.
+ auto qfp_mantissa =
+ (fp32_mantissa >> mantissa_bit_diff) & qfp_mantissa_mask;
+
+ // compute the QFP exponent. Subtract the FP32 offset (127) from the FP32
+ // exponent and add back the QFP exponent offset.
+ auto qfp_exponent =
+ fp32_exponent - kFP32ExponentOffset + qfp_exponent_offset;
+
+ // get the sign bit
+ int qfp_sign = fp32_sign;
+
+ // build the output ac_int
+ if constexpr (is_signed) {
+ return qfp_type((qfp_sign << (qfp_exponent_bits + qfp_mantissa_bits)) |
+ (qfp_exponent << qfp_mantissa_bits) | (qfp_mantissa)) &
+ qfp_mask;
+ } else {
+ return qfp_type((qfp_exponent << qfp_mantissa_bits) | (qfp_mantissa)) &
+ qfp_mask;
+ }
+ }
+
+ //
+ // CONSTEXPR
+ // convert from a 32-bit float to a QFP
+ //
+ static constexpr qfp_type FromFP32CE(float f) {
+ // get the sign, exponent, and mantissa from the float
+ int fp32_sign = (f < 0) ? 1 : 0;
+ int fp32_exponent = FP32ExtractExponent(f) + kFP32ExponentOffset;
+ int fp32_mantissa = FP32ExtractMantissa(f);
+
+ // get the most significant qfp_mantissa_bits from the float's mantissa
+ // NOTE: we are doing truncation here, not rounding.
+ int qfp_mantissa =
+ (fp32_mantissa >> mantissa_bit_diff) & qfp_mantissa_mask;
+
+ // compute the QFP exponent. Subtract the FP32 offset (127) from the FP32
+ // exponent and add back the QFP exponent offset.
+ const int qfp_exponent_tmp =
+ (fp32_exponent == 0) ? 0 :
+ (int(fp32_exponent) - kFP32ExponentOffset + qfp_exponent_offset);
+ int qfp_exponent = (qfp_exponent_tmp < 0) ? 0 : qfp_exponent_tmp;
+
+ // get the sign bit
+ int qfp_sign = fp32_sign;
+
+ // build the output ac_int
+ if constexpr (is_signed) {
+ return qfp_type((qfp_sign << (qfp_exponent_bits + qfp_mantissa_bits)) |
+ (qfp_exponent << qfp_mantissa_bits) | (qfp_mantissa)) &
+ qfp_mask;
+ } else {
+ return qfp_type((qfp_exponent << qfp_mantissa_bits) | (qfp_mantissa)) &
+ qfp_mask;
+ }
+ }
+
+ //
+ // convert a QFP to a 32-bit float
+ //
+ static float ToFP32(qfp_type i) {
+ int sign_bit = 0;
+ if constexpr (!is_signed) {
+ sign_bit = (i >> (qfp_exponent_bits + qfp_mantissa_bits)) & 0x1;
+ }
+
+ ac_int fp32_exponent_tmp =
+ (i >> qfp_mantissa_bits) & qfp_exponent_mask;
+ auto fp32_exponent =
+ fp32_exponent_tmp - qfp_exponent_offset + kFP32ExponentOffset;
+ ac_int fp32_mantissa =
+ (i & qfp_mantissa_mask) << mantissa_bit_diff;
+
+ FloatCast f_casted;
+ f_casted.parts.sign = sign_bit;
+ f_casted.parts.exponent = fp32_exponent;
+ f_casted.parts.mantissa = fp32_mantissa;
+
+ return f_casted.f;
+ }
+
+ //
+ // CONSTEXPR
+ // convert a QFP to a 32-bit float
+ //
+ static constexpr float ToFP32CE(qfp_type i) {
+ int sign_bit = 0;
+ if constexpr (!is_signed) {
+ sign_bit = (i >> (qfp_exponent_bits + qfp_mantissa_bits)) & 0x1;
+ }
+ int fp32_exponent_tmp =
+ int((i >> qfp_mantissa_bits) & qfp_exponent_mask);
+ int fp32_exponent =
+ fp32_exponent_tmp - qfp_exponent_offset + kFP32ExponentOffset;
+ int fp32_mantissa = (i & qfp_mantissa_mask) << mantissa_bit_diff;
+
+ int offset_exponent =
+ (fp32_exponent == 0) ? 0 : (fp32_exponent - kFP32ExponentOffset);
+ // https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+ //compute the mantissa sum
+ float mantissa_sum = (fp32_exponent == 0) ? 0.0 : 1.0;
+ for (int i = 1; i <= kFP32MantissaBits; i++) {
+ mantissa_sum +=
+ ((fp32_mantissa >> (kFP32MantissaBits-i)) & 0x1) * Pow(2, -i);
+ }
+
+ if (sign_bit == 0) return Pow(2, offset_exponent) * mantissa_sum;
+ else return -1.0f * Pow(2, offset_exponent) * mantissa_sum;
+ }
+
+ private:
+ QFP();
+};
+
+#endif /* _QFP_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_exp_lut.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_exp_lut.hpp
new file mode 100644
index 0000000000..7921f3fdb0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_exp_lut.hpp
@@ -0,0 +1,44 @@
+#ifndef __QFP_EXP_LUT_HPP__
+#define __QFP_EXP_LUT_HPP__
+
+#include "qfp.hpp"
+#include "rom_base.hpp"
+
+// the QFP bits for the ExpLUT
+constexpr unsigned kExpQFPTotalBits = 10;
+constexpr unsigned kExpQFPExponentBits = 6;
+constexpr unsigned kExpLUTDepth = (1 << kExpQFPTotalBits);
+constexpr int kExpTaylorSeriesTerms = 70;
+
+static_assert(kExpQFPTotalBits >= kExpQFPExponentBits);
+static_assert(kExpTaylorSeriesTerms > 3);
+
+//
+// A LUT for computing exp(-x)
+// Uses ROMBase to create a ROM initialized with the values of exp(-x)
+// using quantized floating point (QFP) numbers for indices.
+//
+struct ExpLUT : ROMBase {
+ // the QFP format
+ using QFP = QFP;
+
+ // the functor used to initialize the ROM
+ // NOTE: anything called from within the functor's operator() MUST be
+ // constexpr or else you won't get a ROM
+ struct InitFunctor {
+ constexpr unsigned short operator () (int x) const {
+ // treat the ROM index as a QFP number and convert to a float (f) and use
+ // the float to compute exp(-f) (== 1/exp(f)) and initialize that entry
+ // of the ROM
+ float f = QFP::ToFP32CE(x);
+ float val = 1.0f / hldutils::Exp(f, kExpTaylorSeriesTerms);
+ return QFP::FromFP32CE(val);
+ }
+ constexpr InitFunctor() = default;
+ };
+
+ // constexpr constructor using the initializer above
+ constexpr ExpLUT() : ROMBase(InitFunctor()) {}
+};
+
+#endif /*__QFP_EXP_LUT_HPP__*/
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_inv_lut.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_inv_lut.hpp
new file mode 100755
index 0000000000..4ee7cedb25
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/qfp_inv_lut.hpp
@@ -0,0 +1,38 @@
+#ifndef __QFP_INV_LUT_HPP__
+#define __QFP_INV_LUT_HPP__
+
+#include "qfp.hpp"
+#include "rom_base.hpp"
+
+// the QFP bits for the Pow2LUT
+constexpr unsigned kInvQFPTotalBits = 10;
+constexpr unsigned kInvQFPExponentBits = 3;
+constexpr unsigned kInvLutDepth = (1 << kInvQFPTotalBits);
+static_assert(kInvQFPTotalBits >= kInvQFPExponentBits);
+
+//
+// A LUT for computing 1/x
+//
+struct InvLUT : ROMBase {
+ // the QFP format
+ using QFP = QFP;
+
+ // the functor used to initialize the ROM
+ // NOTE: anything called from within the functor's operator() MUST be
+ // constexpr or else you won't get a ROM
+ struct InitFunctor {
+ constexpr unsigned short operator () (int x) const {
+ // treat the ROM index as a QFP number and convert to a float (f) and use
+ // the float to compute 1/f and initialize that entry of the ROM
+ float f = QFP::ToFP32CE(x);
+ float val = 1.0f / f ;
+ return QFP::FromFP32CE(val);
+ }
+ constexpr InitFunctor() = default;
+ };
+
+ // constexpr constructor using the initializer above
+ constexpr InvLUT() : ROMBase(InitFunctor()) {}
+};
+
+#endif /* __QFP_INV_LUT_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/rom_base.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/rom_base.hpp
new file mode 100644
index 0000000000..1059474386
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/rom_base.hpp
@@ -0,0 +1,68 @@
+#ifndef __ROM_BASE_HPP__
+#define __ROM_BASE_HPP__
+
+#include
+
+//
+// A base class for creating a constexpr ROM.
+//
+// TEMPLATE PARAMETERS
+// T: the datatype stored in the ROM
+// _depth: the depth of the ROM
+//
+// EXAMPLE USAGE
+// To use the ROM, you must create a class that inherits from this class and
+// provides a constexpr functor to the constructor which determines how the
+// ROM is initialized. The following examples show two methods for creating
+// a ROM that stores x^2, where 'x' is the index into the ROM.
+//
+// USING A FUNCTOR
+// struct SquareFunctor {
+// constexpr float operator () (int x) const { return x * x }
+// constexpr SquareFunctor() = default;
+// };
+//
+// constexpr int lut_depth = 1024;
+// struct SquareLUT : ROMBase {
+// constexpr SquareLUT() : ROMBase(SquareFunctor()) {}
+// };
+//
+// USING A LAMDA
+// constexpr int lut_depth = 1024;
+// struct SquareLUT : ROMBase {
+// constexpr SquareLUT() : ROMBase(
+// [](int x) { return x * x; }) {}
+// };
+//
+template
+struct ROMBase {
+ // ensure a positive depth
+ static_assert(rom_depth > 0);
+
+ // allows the depth of the ROM to be queried
+ static constexpr int depth = rom_depth;
+
+ // allows the type stored in the ROM to be queried
+ using ValType = T;
+
+ // constexpr constructor that initializes the contents of the ROM
+ // using a user specified Functor. NOTE: the functor must be constexpr,
+ // which can be achieved with a lamda or by marking the operator() function
+ // as constexpr.
+ template
+ constexpr ROMBase(const InitFunctor& func) : data_() {
+ static_assert(std::is_invocable_r_v);
+
+ for (int i = 0; i < rom_depth; i++) {
+ data_[i] = func(i);
+ }
+ }
+
+ // only define a const operator[], since this is a ROM
+ const T& operator[](int i) const { return data_[i]; }
+
+ protected:
+ T data_[rom_depth];
+};
+
+#endif /* __ROM_BASE_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/row_stencil.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/row_stencil.hpp
new file mode 100644
index 0000000000..4fb98d6212
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/row_stencil.hpp
@@ -0,0 +1,147 @@
+#ifndef __ROW_STENCIL_HPP__
+#define __ROW_STENCIL_HPP__
+
+#include
+#include
+#include
+
+#include "data_bundle.hpp"
+#include "mp_math.hpp"
+#include "shift_reg.hpp"
+#include "unrolled_loop.hpp"
+
+using namespace sycl;
+using namespace hldutils;
+
+//
+// helper function to pad the number of columns based on the filter size
+//
+template
+IndexT PadColumns(IndexT cols) {
+ constexpr int kPaddingPixels = filter_size / 2;
+ return cols + 2 * kPaddingPixels;
+}
+
+//
+// Generic 1D row (i.e. horizontal) stencil.
+//
+// TEMPLATE PARAMETERS
+// InType: The input pixel type. This is read in by the row stencil
+// through a SYCL pipe. The pipe should be hold
+// 'parallel_cols' elements of this type using the
+// 'DataBundle' type (DataBundle).
+// OutType: The output pixel type. The same logic as the InType above.
+// The data written to the output type is
+// DataBundle
+// IndexT: The datatype used for indexing. This type should have
+// enough bits to count up to the number or rows and columns.
+// InPipe: The input pipe to stream in 'parallel_cols' 'InT' values.
+// OutPipe: The output pipe to stream out 'parallel_cols' 'OutT'
+// values.
+// filter_size: The filter size (i.e., the number of pixels to convolve).
+// parallel_cols: The number of columns to compute in parallel.
+// StencilFunction: The stencil callback functor, provided by the user, which
+// is called for every pixel to perform the actual
+// convolution. The function definition should be as follows:
+//
+// OutT MyStencilFunction(int, int, ShiftReg,
+// FunctionArgTypes...)
+//
+// The user can provide extra arguments to the callback by
+// using the FunctionArgTypes parameter pack.
+// FunctionArgTypes: The user-provided type parameter pack of the arguments to
+// pass to the callback function.
+//
+//
+// FUNCTION ARGUMENTS
+// rows: The number of rows in the image.
+// cols: The number of columns in the image.
+// computed by the IP is rows*cols.
+// zero_val: The 'zero' value for the stencil. This is used to pad
+// the columns of the image.
+// func: The user-defined functor. This is a callback that is called
+// to perform the 1D convolution.
+// stencil_args...: The parameter pack of arguments to be passed to the
+// user-defined callback functor.
+//
+template
+void RowStencil(IndexT rows, IndexT cols, const InType zero_val,
+ StencilFunction func, FunctionArgTypes... stencil_args) {
+ // types coming into and out of the kernel from pipes, respectively
+ using InPipeT = DataBundle;
+ using OutPipeT = DataBundle;
+
+ // number of pixels to pad to the columns with
+ constexpr int kPaddingPixels = filter_size / 2;
+
+ // the size of the shift register to hold the window
+ constexpr int kShiftRegSize = filter_size + parallel_cols - 1;
+ constexpr IndexT kColThreshLow = kPaddingPixels;
+
+ // static asserts to validate template arguments
+ static_assert(filter_size > 1);
+ static_assert(parallel_cols > 0);
+ static_assert(IsPow2(parallel_cols));
+ static_assert(std::is_integral_v);
+ static_assert(std::is_invocable_r_v,
+ FunctionArgTypes...>);
+
+ // constants
+ const IndexT col_thresh_high = cols + kPaddingPixels;
+ const IndexT padded_cols = PadColumns(cols);
+ const IndexT col_loop_bound = padded_cols / parallel_cols;
+
+ // the shift register
+ [[intel::fpga_register]] ShiftReg shifty_pixels;
+
+ // initialize the contents of the shift register
+ #pragma unroll
+ for (int i = 0; i < kShiftRegSize; i++) {
+ shifty_pixels[i] = zero_val;
+ }
+
+ // the main processing loop for the image
+ [[intel::initiation_interval(1)]]
+ for (IndexT row = 0; row < rows; row++) {
+ [[intel::initiation_interval(1)]]
+ for (IndexT col_loop = 0; col_loop < col_loop_bound; col_loop++) {
+ IndexT col = col_loop * parallel_cols;
+
+ // read from the input pipe if there are still pixels to read
+ InPipeT new_pixels(zero_val);
+ if (col < cols) {
+ new_pixels = InPipe::read();
+ }
+
+ // shift in the input pixels
+ shifty_pixels.ShiftMultiVals(new_pixels);
+
+ // Perform the convolution on the 1D window
+ OutPipeT out_data(OutType(0));
+ UnrolledLoop<0, parallel_cols>([&](auto stencil_idx) {
+ const int col_local = col + stencil_idx;
+ ShiftReg shifty_pixels_copy;
+
+ // first, make an offsetted copy of the shift register
+ UnrolledLoop<0, filter_size>([&](auto x) {
+ shifty_pixels_copy[x] = shifty_pixels[x + stencil_idx];
+ });
+
+ // call the user's callback function for the operator
+ out_data[stencil_idx] = func(row, (col_local - kColThreshLow),
+ shifty_pixels_copy, stencil_args...);
+ });
+
+ // write the output data if it is in range (i.e., it is a real pixel
+ // and not part of the padding)
+ if ((col >= kColThreshLow) && (col < col_thresh_high)) {
+ OutPipe::write(out_data);
+ }
+ }
+ }
+}
+
+#endif /* __ROW_STENCIL_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/shift_reg.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/shift_reg.hpp
new file mode 100644
index 0000000000..d428520029
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/shift_reg.hpp
@@ -0,0 +1,141 @@
+#ifndef __SHIFT_REG_HPP__
+#define __SHIFT_REG_HPP__
+
+#include "data_bundle.hpp"
+#include "unrolled_loop.hpp"
+
+namespace hldutils {
+
+//
+// A class to represent a shift register of depth 'depth' holding elements
+// of type 'T'.
+//
+template
+class ShiftReg {
+ T registers_[depth];
+
+ public:
+ // DO NOT Create a constructor for this; the compiler does not
+ // handle it well.
+ // empty default constructor since you should fill a shift-register by
+ // priming it, and if `T` is a struct, we might get a looping constructor.
+ ShiftReg() {}
+
+ // For a shift register with N columns, the first piece of data is inserted in
+ // index [N-1], and is read out of index [0].
+ //
+ // ```
+ // i=0 1 2
+ // ┌───┬───┬───┐
+ // out ◄─ │ r ◄─e ◄─g ◄─ input
+ // └───┴───┴───┘
+ // ```
+ void Shift(T &in) {
+ UnrolledLoop<0, (depth - 1)>([&](auto i) {
+ registers_[i] = registers_[i + 1];
+ });
+ registers_[depth - 1] = in;
+ }
+
+ template
+ void shiftSingleVal(T &in) {
+ UnrolledLoop<0, (depth - shift_amt)>([&](auto i) {
+ registers_[i] = registers_[i + shift_amt];
+ });
+
+ UnrolledLoop<(depth - shift_amt), depth>([&](auto i) {
+ registers_[i] = in;
+ });
+ }
+
+ template
+ void ShiftMultiVals(DataBundle &in) {
+ UnrolledLoop<0, (depth - shift_amt)>([&](auto i) {
+ registers_[i] = registers_[i + shift_amt];
+ });
+
+ UnrolledLoop<0, shift_amt>([&](auto i) {
+ registers_[(depth - shift_amt) + i] = in[i];
+ });
+ }
+
+ // use an accessor like this to force static accesses
+ template
+ T Get() {
+ static_assert(idx < depth);
+ return registers_[idx];
+ }
+
+ T &operator[](int i) { return registers_[i]; }
+ const T &operator[](int i) const { return registers_[i]; }
+};
+
+//
+// A class to represent a 2D shift register with 'rows' rows of depth 'depth'
+// holding elements of type 'T'.
+//
+template
+class ShiftReg2d {
+ ShiftReg registers_[rows];
+
+ public:
+ // DO NOT Create constructor for this; the compiler does not handle it well.
+ // empty default constructor since you should fill a shift-register by
+ // priming it, and if `T` is a struct, we might get a looping constructor.
+ ShiftReg2d() {}
+
+ // For a shift register with M rows and N columns, the first piece of data is
+ // inserted in index [M-1][N-1], and is read out of index [0][0].
+ // j=0 1 2
+ // +----+----+----+
+ // out <- | <- | <- | <- | i=0
+ // +----+----+----+
+ // | ^- | <- | <- | i=1
+ // +----+----+----+
+ // | ^- | <- | <- | <- input i=2
+ // +----+----+----+
+ void Shift(T &in) {
+ UnrolledLoop<0, (rows - 1)>([&](auto i) {
+ registers_[i].Shift(registers_[i + 1][0]);
+ });
+ registers_[(rows - 1)].Shift(in);
+ }
+
+ // For a shift register with M rows and N columns, the first column of data
+ // is inserted in column [N-1], and is read out of column [0].
+ // j=0 1 2
+ // ┌───┬───┬───┐
+ // ◄─ r ◄ e ◄ g ◄─
+ // ├───┼───┼───┤
+ // ◄─ r ◄ e ◄ g ◄─
+ // ├───┼───┼───┤
+ // ◄─ r ◄ e ◄ g ◄─
+ // └───┴───┴───┘
+ void ShiftCol(T in[rows]) {
+ UnrolledLoop<0, rows>([&](auto i) {
+ registers_[i].Shift(in[i]);
+ });
+ }
+
+ template
+ void ShiftCols(DataBundle in[rows]) {
+ UnrolledLoop<0, rows>([&](auto i) {
+ registers_[i].template ShiftMultiVals(in[i]);
+ });
+ }
+
+ // use an accessor like this to force static accesses
+ template
+ T Get() {
+ static_assert(row < rows);
+ static_assert(col < depth);
+ return registers_[row][col];
+ }
+
+ ShiftReg &operator[](int i) { return registers_[i]; }
+ const ShiftReg &operator[](int i) const { return registers_[i]; }
+};
+
+} // namespace hldutils
+
+#endif /* __SHIFT_REG_HPP__ */
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/unrolled_loop.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/unrolled_loop.hpp
new file mode 100755
index 0000000000..5212bca1d7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/anr/src/unrolled_loop.hpp
@@ -0,0 +1,188 @@
+#ifndef __UNROLLEDLOOP_HPP__
+#define __UNROLLEDLOOP_HPP__
+#pragma once
+
+#include
+#include
+
+namespace hldutils {
+//
+// The code below creates the constexprs 'make_integer_range'
+// and 'make_index_range' these are akin to 'std::make_integer_sequence'
+// and 'std::make_index_sequence', respectively.
+// However they allow you to specificy a range and can either increment
+// or decrement, rather than a strict increasing sequence
+//
+template
+struct integer_range_impl;
+
+// incrementing case
+template
+struct integer_range_impl, begin, true> {
+ using type = std::integer_sequence;
+};
+
+// decrementing case
+template
+struct integer_range_impl, begin, false> {
+ using type = std::integer_sequence;
+};
+
+// integer_range
+template
+using integer_range = typename integer_range_impl,
+ begin,
+ (begin < end)>::type;
+
+//
+// make_integer_range
+//
+// USAGE:
+// make_integer_range{} ==> 1,2,...,9
+// make_integer_range{} ==> 10,9,...,2
+//
+template
+using make_integer_range = integer_range;
+
+//
+// make_index_range
+//
+// USAGE:
+// make_index_range<1,10>{} ==> 1,2,...,9
+// make_index_range<10,1>{} ==> 10,9,...,2
+//
+template
+using make_index_range = integer_range;
+
+//
+// The code below creates the constexprs 'make_integer_pow2_sequence'
+// and 'make_index_pow2_sequence'. These generate the sequence
+// 2^0, 2^1, 2^2, ... , 2^(N-1) = 1,2,4,...,2^(N-1)
+//
+template
+struct integer_pow2_sequence_impl;
+
+template
+struct integer_pow2_sequence_impl> {
+ using type = std::integer_sequence;
+};
+
+// integer_pow2_sequence
+template
+using integer_pow2_sequence =
+ typename integer_pow2_sequence_impl>::type;
+
+//
+// make_integer_pow2_sequence
+//
+// USAGE:
+// make_integer_pow2_sequence{} ==> 1,2,4,8,16
+//
+template
+using make_integer_pow2_sequence = integer_pow2_sequence;
+
+//
+// make_index_pow2_sequence
+//
+// USAGE:
+// make_index_pow2_sequence<5>{} ==> 1,2,4,8,16
+//
+template
+using make_index_pow2_sequence = integer_pow2_sequence;
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Example usage for UnrolledLoop constexpr:
+//
+// Base
+// UnrolledLoop(std::integer_sequence{},[&](auto i) {
+// /* i = 5,2,7,8 */
+// });
+//
+// Case A
+// UnrolledLoop<10>([&](auto i) {
+// /* i = 0,1,...,9 */
+// });
+//
+// Case B
+// UnrolledLoop<10>([&](auto i) {
+// /* i = 0,1,...,9 */
+// });
+//
+// Case C
+// UnrolledLoop([&](auto i) {
+// /* i = 1,2,...,9 */
+// });
+// UnrolledLoop([&](auto i) {
+// /* i = 10,9,...,2 */
+// });
+//
+// Case D
+// UnrolledLoop<1, 10>([&](auto i) {
+// /* i = 1,2,...,9 */
+// });
+// UnrolledLoop<10, 1>([&](auto i) {
+// /* i = 10,9,...,2 */
+// });
+//
+///////////////////////////////////////////////////////////////////////////////
+
+//
+// Base implementation
+// Templated on:
+// ItType - the type of the iterator (size_t, int, char, ...)
+// ItType... - the indices to iterate on
+// F - the function to run for each index (i.e. the lamda)
+//
+template
+constexpr void UnrolledLoop(std::integer_sequence, F&& f) {
+ (f(std::integral_constant{}), ...);
+}
+
+//
+// Convience implementation (A)
+// performs UnrolledLoop in range [0,n) with iterator of type ItType
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(std::make_integer_sequence{}, std::forward(f));
+}
+
+//
+// Convenience implementation (B)
+// performs UnrolledLoop in range [0,n) with an iterator of type std::size_t
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(std::make_index_sequence{}, std::forward(f));
+}
+
+//
+// Convenience implementation (C)
+// performs UnrolledLoop from start...end with an iterator of type ItType
+// NOTE: start is INCLUSIVE, end is EXCLUSIVE
+// NOTE: if start<=end, sequence is start,start+1,...,end-1
+// if end<=start, sequence is start,start-1,...,end+1
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(make_integer_range{}, std::forward(f));
+}
+
+//
+// Convenience implementation (C)
+// performs UnrolledLoop from start...end with an iterator of type size_t
+// NOTE: start is INCLUSIVE, end is EXCLUSIVE
+// NOTE: if start<=end, sequence is start,start+1,...,end-1
+// if end<=start, sequence is start,start-1,...,end+1
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(make_index_range{}, std::forward