oneapi-src · mdbtucker · Apr 22, 2021 · Apr 12, 2021 · Apr 12, 2021 · Apr 12, 2021
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/README.md
@@ -122,62 +122,41 @@ You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) an
 
  1. Run the sample on the FPGA emulator (the kernel executes on the CPU).
      ```
-     ./mvdr_beamforming.fpga_emu --in=../data         (Linux)
-     ./mvdr_beamforming.fpga_emu.exe --in=../data     (Windows)
+     ./mvdr_beamforming.fpga_emu 1024 ../data .          (Linux)
+     ./mvdr_beamforming.fpga_emu.exe 1024 ../data .      (Windows)
      ```
 
 2. Run the sample on the FPGA device.
      ```
-     ./mvdr_beamforming.fpga --in=../data             (Linux)
+     ./mvdr_beamforming.fpga 1024 ../data .              (Linux)
      ```
 
 ### Application Parameters
 
-| Argument                  | Description
-|---                        |---
-| `--in=<path to input>`    | Specifies the directory that contains the input files (default=`../data`)
-| `--out=<path to output>`  | Specifies the directory to produce output data to (default=`.`)
+| Argument Index        | Description
+|---                    |---
+| 1                     | The number of matrices (default=`1024`)
+| 2                     | The input directory (default=`../data`)
+| 3                     | The output directory (default=`.`)
 
 ### Example of Output
 You should see the following output in the console:
 
 ```
+Matrices:         1024
+Input Directory:  '../data'
+Output Directory: '.'
+
 Reading training data from '../data/A_real.txt and ../data/A_imag.txt
 Reading input data from ../data/X_real.txt and ../data/X_imag.txt
-Calculated sin(theta) values
-Launched MVDR kernels
-
-*** Basic single matrix and steering vectors test ***
-Launching consumer kernel
-Launching producer kernels
-Producer kernels finished
-Consumer kernels finished
-Writing output to ./out_real.txt and ./out_imag.txt
-Checking output data against ../data/small_expected_out_real.txt and ../data/small_expected_out_imag.txt
-Output data check succeeded
-
-*** Re-send single matrix test ***
-Re-sending Xrx and training data
-Producer kernels finished
-Consumer kernels finished
-Checking output data against ../data/small_expected_out_real.txt and ../data/small_expected_out_imag.txt
-Output data check succeeded
-
-*** Modify weight vectors test (expect data mismatch) ***
-Modifying and sending sin(theta) values
-Re-sending Xrx and training data two times
-Output data mismatched as expected
-Restoring original sin(theta)[0] value
-Re-sending Xrx and training data two times
-Checking output data against ../data/small_expected_out_real.txt and ../data/small_expected_out_imag.txt
-Output data check succeeded
 
 *** Launching throughput test of 1024 matrices ***
 Sensor inputs                 : 16
 Training matrix rows          : 48
 Data rows per training matrix : 48
 Steering vectors              : 25
 Throughput: 34.6133 matrices/second
+Throughput: 82.5219 matrices/second
 Checking output data against ../data/small_expected_out_real.txt and ../data/small_expected_out_imag.txt
 Output data check succeeded
 PASSED
@@ -188,23 +167,27 @@ PASSED
 ### Source Code Breakdown
 | File                           | Description 
 |:---                            |:---
-|`mvdr_beamforming.cpp`          | Contains the `main()` function and the top-level interfaces to the MVDR functions.
+|`mvdr_beamforming.cpp`          | Contains the `main()` function and the top-level interfaces to the MVDR functions
 |`BackwardSubstitution.hpp`      | Backward Substitution kernel
 |`Beamformer.hpp`                | Beamformer kernel, multiplies input vectors by each weight vector to generate final output
 |`CalcWeights.hpp`               | CalcWeights kernel, multiplies BackwardSubstitution output by steering vectors
 |`Constants.hpp`                 | Defines constants used throught the design, some can be overridden from the command line during compiliation
 |`FakeIOPipes.hpp`               | Implements 'fake' IO pipes, which interface to the host
 |`ForwardSubstitution.hpp`       | Forward Substitution kernel
+|`InputDemux.hpp`                | InputDemux kernel, separates training and processing data
 |`mvdr_complex.hpp`              | Definition of ComplexType, used throughout this design
 |`MVDR.hpp`                      | Function to launch all MVDR kernels and define the pipes that connect them together
-|`NullPipe.hpp`                  | Defines the NullPipe class which allows pipe interfaces on kernels to be unused
-|`pipe_array.hpp`                | Header file containing the definition of an array of pipes. 
-|`pipe_array_internal.hpp`       | Helper for pipe_array.hpp. 
+|`ParallelCopyArray.hpp`         | Defines the ParallelCopyArray class, an array that supports unrolled copy / assign operations
+|`pipe_array.hpp`                | Header file containing the definition of an array of pipes
+|`pipe_array_internal.hpp`       | Helper for pipe_array.hpp
+|`PipeDuplicator.hpp`            | Defines the PipeDuplicator class, creates multiple copies of a pipe for fan-out
 |`SteeringVectorGenerator.hpp`   | SteeringVectorGenerator kernel, generates steering vectors based on data from the host
 |`StreamingQRD.hpp`              | StreamingQRD kernel, performs Q-R Decompostion on a matrix
 |`Transpose.hpp`                 | Transpose kernel, reorders data for the StreamingQRD kernel
 |`Tuple.hpp`                     | A templated tuple that defines the NTuple class which is used for pipe interfaces
-|`UnrolledLoop.hpp`              | A templated-based loop unroller that unrolls loops in the compiler front end 
+|`udp_loopback_test.cpp`         | Contains the `main()` function for the loopback test. This code is only relevant for use with real IO pipes
+|`UDP.hpp`                       | This code is **only** relevant for using the real IO pipes (i.e. not in the devcloud). This is discussed later in the [Using Real IO-pipes Section](#using-real-io-pipes)
+|`UnrolledLoop.hpp`              | A template-based loop unroller that unrolls loops in the compiler front end
 
 ### MVDR Beamforming
 This reference design is built upon the **IO Streaming** code sample.
@@ -214,3 +197,64 @@ The images below show the dataflow in the MVDR beamforming design. The first ima
 <img src="processing_kernels_ideal.png" alt="processing_kernels_ideal" width="800"/>
 <img src="processing_kernels_fake.png" alt="processing_kernels_fake" width="800"/>
 
+### Using Real IO-pipes
+This section describes how to build and run this reference design on a BSP with real IO pipes. The real IO pipes version does **not** work on Windows and requires a specific system setup and BSP.
+
+#### Getting access to the BSP
+This design requires a specific board support package (BSP) with a distinct hardware configuration.  For access to this BSP or general customer support, submit a case through Intel&reg; Premier Support (IPS) or contact your Intel or Distribution Sales Representative.
+
+#### Building the loopback test and Reference Design with real IO pipes
+Use the following commands to generate a Makefile for building both the loopback test and reference design:
+```
+mkdir build
+cd build
+
+cmake .. -DREAL_IO_PIPES=1 -DFPGA_BOARD=pac_s10_usm_udp
+```
+
+The `REAL_IO_PIPES` cmake flag defines a variable that is used *exclusively* in `mvdr_beamforming.cpp` to create a kernel system using real IO pipes, as opposed to the fake IO pipes described earlier in this document.
+
+To build the loopback test, use the following command:
+```
+make udp_loopback_test
+```
+
+To build the MVDR reference design, use the following command:
+```
+make fpga
+```
+
+#### Running the loopback test and reference design with real IO pipes
+To run the loopback test, use the following command:
+```
+./udp_loopback_test.fpga 64:4C:36:00:2F:20 192.168.0.11 34543 255.255.255.0 94:40:C9:71:8D:10 192.168.0.10 34543 10000000
+```
+
+| Argument Index        | Description
+|---                    |---
+| 1                     | FPGA MAC Address
+| 2                     | FPGA IP Address
+| 3                     | FPGA UDP Port
+| 4                     | FPGA Netmask
+| 5                     | Host MAC Address
+| 6                     | Host IP Address
+| 7                     | Host UDP Port
+| 8                     | Number of packets (optional, default=`100000000`)
+
+To run the MVDR reference design with real IO pipes, use the following command:
+```
+./mvdr_beamforming.fpga 64:4C:36:00:2F:20 192.168.0.11 34543 255.255.255.0 94:40:C9:71:8D:10 192.168.0.10 34543 1024 ../data .
+```
+
+| Argument Index        | Description
+|---                    |---
+| 1                     | FPGA MAC Address
+| 2                     | FPGA IP Address
+| 3                     | FPGA UDP Port
+| 4                     | FPGA Netmask
+| 5                     | Host MAC Address
+| 6                     | Host IP Address
+| 7                     | Host UDP Port
+| 8                     | The number of matrices (optional, default=`1024`)
+| 9                     | The input directory (optional, default=`../data`)
+| 10                    | The output directory (optional, default=`.`)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/mvdr_beamforming.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/mvdr_beamforming.vcxproj
@@ -20,12 +20,13 @@
     <ClInclude Include="src\Constants.hpp" />
     <ClInclude Include="src\FakeIOPipes.hpp" />
     <ClInclude Include="src\ForwardSubstitution.hpp" />
+    <ClInclude Include="src\InputDemux.hpp" />
     <ClInclude Include="src\mvdr_complex.hpp" />
     <ClInclude Include="src\MVDR.hpp" />
-    <ClInclude Include="src\NullPipe.hpp" />
     <ClInclude Include="src\ParallelCopyArray.hpp" />
-    <ClInclude Include="src\pipe_array_internal.hpp" />
     <ClInclude Include="src\pipe_array.hpp" />
+    <ClInclude Include="src\pipe_array_internal.hpp" />
+    <ClInclude Include="src\PipeDuplicator.hpp" />
     <ClInclude Include="src\SteeringVectorGenerator.hpp" />
     <ClInclude Include="src\StreamingQRD.hpp" />
     <ClInclude Include="src\Transpose.hpp" />

diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/mvdr_beamforming.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/mvdr_beamforming.vcxproj.user
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LocalDebuggerCommandArguments>--in=data</LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>0 ./data</LocalDebuggerCommandArguments>
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">

diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Beamformer.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Beamformer.hpp
@@ -75,16 +75,12 @@ event SubmitBeamformerKernel(
 
   auto e = q.submit([&](handler& h) {
     h.single_task<BeamformerKernelName>([=] {
-      // count the number of xrx vectors that have been processed with the
-      // current weight vectors
-      int xrx_vectors_processed = 0;
-
       while (1) {
         CalcType weight_vectors[k_num_weight_vectors][kNumCalcTypePerVector];
 
         // load the weight vectors to be used with the next set of Xrx vectors
-        for (char vector_num = 0; vector_num < (char)k_num_weight_vectors;
-             vector_num++) {
+        for (unsigned char vector_num = 0;
+             vector_num < (unsigned char)k_num_weight_vectors; vector_num++) {
           // weights are loaded in reverse order
           for (short i = kNumCalcTypePerVector - 1; i >= 0; i--) {
             for (short j = (short)k_unroll_factor - 1; j >= 0; j--) {
@@ -126,8 +122,8 @@ event SubmitBeamformerKernel(
           ComplexType result[k_num_weight_vectors];
 
           // calculate an output vector for each weight vector
-          for (char vector_num = 0; vector_num < (char)k_num_weight_vectors;
-               vector_num++) {
+          for (unsigned char vector_num = 0;
+               vector_num < (unsigned char)k_num_weight_vectors; vector_num++) {
             // zero the accumulators
             UnrolledLoop<k_unroll_factor>([&](auto i) { accum_vector[i] = 0; });
 
@@ -148,8 +144,8 @@ event SubmitBeamformerKernel(
 
           }  // end of for( vector_num... )
 
-          for (char vector_num = 0; vector_num < (char)k_num_weight_vectors;
-               vector_num++) {
+          for (unsigned char vector_num = 0;
+               vector_num < (unsigned char)k_num_weight_vectors; vector_num++) {
             // send the result out
             DataOutPipe::write(result[vector_num]);
           }

diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/CMakeLists.txt
@@ -28,6 +28,25 @@ if(WIN32)
     set(WIN_FLAG "/EHsc")
 endif()
 
+# Allow the user to enable real IO pipes
+# e.g. cmake .. -DREAL_IO_PIPES=1
+if(REAL_IO_PIPES)
+    set(REAL_IO_PIPES_FLAG "-DREAL_IO_PIPES")
+    set(STREAMING_PIPE_WIDTH_FLAG "-DSTREAMING_PIPE_WIDTH=1")
+    set(UDP_LINK_FLAGS "-luuid -lopae-c -lpthread")
+    message(STATUS "Design is using real IO pipes")
+
+    # the real IO pipes version doesn't work on Windows, so error out
+    if(WIN32)
+      message(FATAL_ERROR "The real IO pipe design is only supported on Linux")
+    endif()
+endif()
+
+if(FLAT_COMPILE)
+  message(STATUS "Doing a flat compile")
+  set(FLAT_COMPILE_FLAG "-Xsbsp-flow=flat")
+endif()
+
 # Allow the user to enable hardware profiling
 # Profiling can be enabled when running cmake by adding the flag -DPROFILE_HW=1
 # e.g. cmake .. -DPROFILE_HW=1
@@ -54,17 +73,23 @@ if(QRD_MIN_ITERATIONS)
     set(QRD_MIN_ITERATIONS_FLAG "-DQRD_MIN_ITERATIONS=${QRD_MIN_ITERATIONS}")
 endif()
 
+# Allow the user to set the streaming pipe width for the input/output pipes
+# e.g. cmake .. -DSTREAMING_PIPE_WIDTH=2
+if(STREAMING_PIPE_WIDTH)
+    set(STREAMING_PIPE_WIDTH_FLAG "-DSTREAMING_PIPE_WIDTH=${STREAMING_PIPE_WIDTH}")
+endif()
+
 
 # A DPC++ ahead-of-time (AoT) compile processes the device code in two stages.
 # 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
 # 2. The "link" stage invokes the compiler's FPGA backend before linking.
 #    For this reason, FPGA backend flags must be passed as link flags in CMake.
-set(EMULATOR_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga -fbracket-depth=512 ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG} -DFPGA_EMULATOR")
+set(EMULATOR_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fintelfpga -fbracket-depth=512 ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG} ${STREAMING_PIPE_WIDTH_FLAG} -DFPGA_EMULATOR")
 set(EMULATOR_LINK_FLAGS "-fintelfpga ${ENABLE_USM}")
-set(SIMULATOR_COMPILE_FLAGS "-fintelfpga -fbracket-depth=512 ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG}")
+set(SIMULATOR_COMPILE_FLAGS "-Wall -fintelfpga -fbracket-depth=512 ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG} ${STREAMING_PIPE_WIDTH_FLAG}")
 set(SIMULATOR_LINK_FLAGS "-fintelfpga -fbracket-depth=512 -Xssimulation -Xsghdl")
-set(HARDWARE_COMPILE_FLAGS "${WIN_FLAG} -fbracket-depth=512 -fintelfpga ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG}")
-set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -fbracket-depth=512 ${PROFILE_FLAG} ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG} -Xsparallel=2 -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+set(HARDWARE_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fbracket-depth=512 -fintelfpga ${ENABLE_USM} ${SENSOR_SIZE_FLAG} ${NUM_SENSORS_FLAG} ${QRD_MIN_ITERATIONS_FLAG} ${REAL_IO_PIPES_FLAG} ${STREAMING_PIPE_WIDTH_FLAG}")
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -fbracket-depth=512 ${PROFILE_FLAG} -Xsparallel=2 -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS} ${UDP_LINK_FLAGS}")
 # use cmake -D USER_HARDWARE_FLAGS=<flags> to set extra flags for FPGA backend compilation
 
 ###############################################################################
@@ -124,3 +149,15 @@ set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAG
 # The -reuse-exe flag enables rapid recompilation of host-only code changes.
 # See DPC++FPGA/GettingStarted/fast_recompile for details.
 
+
+###############################################################################
+# UDP Loopback test
+###############################################################################
+set(UDP_LOOPBACK_TARGET udp_loopback_test.fpga)
+set(UDP_LOOPBACK_COMPILE_FLAGS "-Wall -fintelfpga")
+set(UDP_LOOPBACK_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${FPGA_BOARD} ${FLAT_COMPILE_FLAG} ${USER_HARDWARE_FLAGS} ${UDP_LINK_FLAGS}")
+add_executable(${UDP_LOOPBACK_TARGET} EXCLUDE_FROM_ALL udp_loopback_test.cpp)
+add_custom_target(udp_loopback_test DEPENDS ${UDP_LOOPBACK_TARGET})
+set_target_properties(${UDP_LOOPBACK_TARGET} PROPERTIES COMPILE_FLAGS "${UDP_LOOPBACK_COMPILE_FLAGS}")
+set_target_properties(${UDP_LOOPBACK_TARGET} PROPERTIES LINK_FLAGS "${UDP_LOOPBACK_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${UDP_LOOPBACK_TARGET}")
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/FakeIOPipes.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/FakeIOPipes.hpp
@@ -232,8 +232,8 @@ class ConsumerImpl : public ProducerConsumerBaseImpl<Id, T, use_host_alloc> {
     auto kernel_ptr = BaseImpl::get_kernel_ptr();
 
     // launch the kernel to read the output into device side global memory
-    // NO-FORMAT comments are for clang-format
     auto kernel_event = q.submit([&](handler &h) {
+      // NO-FORMAT comments are for clang-format
       h.single_task<KernelID>([=
       ]() [[intel::kernel_args_restrict]] {  // NO-FORMAT: Attribute
         kernel_ptr_type ptr(kernel_ptr);

diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ForwardSubstitution.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ForwardSubstitution.hpp
@@ -96,7 +96,7 @@ event SubmitForwardSubstitutionKernel(queue& q) {
         // receive new y_vectors if they are available
         // This odd loop is a fusion of two loops with different trip counts.
         short col = 0, row = 0, i = 0, j = 0;
-        char vector_num = 0;
+        unsigned char vector_num = 0;
         for (int iteration = 0; iteration < kLoadLoopIterations; iteration++) {
           // Load the L and LDiagRecip values
           if (iteration < kNumLElements) {
@@ -135,8 +135,8 @@ event SubmitForwardSubstitutionKernel(queue& q) {
         }  // end of for(i...)
 
         // Loop through all the y vectors
-        for (char vector_num = 0; vector_num < (char)k_num_y_vectors;
-             vector_num++) {
+        for (unsigned char vector_num = 0;
+             vector_num < (unsigned char)k_num_y_vectors; vector_num++) {
           // y_vector_intial contains the unmodified current y vector.  y_vector
           // is used during processing.  Splitting these two vectors allows
           // each to be implemented in a local memory with only one read and