oneapi-src · jimmytwei · Sep 18, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 18, 2023
diff --git a/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/License.txt b/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/License.txt
@@ -0,0 +1,7 @@
+Copyright 2020 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/Makefile b/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/Makefile
@@ -0,0 +1,52 @@
+##=============================================================
+## Copyright © 2020 Intel Corporation
+##
+## SPDX-License-Identifier: MIT
+## =============================================================
+##
+##**************************************************************
+## To compile and run the do concurrent examples: make run_dc
+## To compile and run the for-loop examples: make run_omp
+## To compile and run all examples: make run_all
+##**************************************************************
+
+# Arguments passed to every example run:
+#   -n grid size, -o objects per image, -i number of images,
+#   -d display the image and its edge mask.
+# Override on the command line, e.g.: make run_dc RUN_ARGS="-n 64 -o 4 -i 10"
+RUN_ARGS = -n 12 -o 2 -i 1 -d
+
+# `default` must stay the first target so that a bare `make` builds and runs everything.
+default: run_all
+
+# Conventional alias; previously declared phony but had no rule.
+all: run_all
+
+run_all: run_dc run_omp
+
+# Build and run the three DO CONCURRENT variants.
+# OMP_TARGET_OFFLOAD=MANDATORY asks the OpenMP runtime to fail rather than
+# fall back to the host when the loop cannot be offloaded.
+run_dc: img_seg_do_conc_cpu_seq img_seg_do_conc_cpu_par img_seg_do_conc_gpu
+	./img_seg_do_conc_cpu_seq $(RUN_ARGS)
+	./img_seg_do_conc_cpu_par $(RUN_ARGS)
+	OMP_TARGET_OFFLOAD=MANDATORY ./img_seg_do_conc_gpu $(RUN_ARGS)
+
+# Build and run the three OpenMP-directive variants.
+run_omp: img_seg_cpu img_seg_omp_cpu img_seg_omp_gpu
+	./img_seg_cpu $(RUN_ARGS)
+	./img_seg_omp_cpu $(RUN_ARGS)
+	OMP_TARGET_OFFLOAD=MANDATORY ./img_seg_omp_gpu $(RUN_ARGS)
+
+# OMP_OPTS enables OpenMP; GPU_OPTS additionally targets the GPU and lets
+# DO CONCURRENT loops be offloaded through the OpenMP backend.
+OMP_OPTS = -qopenmp
+GPU_OPTS = -fopenmp-targets=spir64 -fopenmp-target-do-concurrent
+
+# Each source file is compiled three ways: no OpenMP (sequential),
+# OpenMP on the CPU, and OpenMP with GPU offload.
+img_seg_do_conc_cpu_seq: img_seg_do_concurrent.F90
+	ifx $< -o $@
+
+img_seg_do_conc_cpu_par: img_seg_do_concurrent.F90
+	ifx $< -o $@ $(OMP_OPTS)
+
+img_seg_do_conc_gpu: img_seg_do_concurrent.F90
+	ifx $< -o $@ $(OMP_OPTS) $(GPU_OPTS)
+
+img_seg_cpu: img_seg_omp_target.F90
+	ifx $< -o $@
+
+img_seg_omp_cpu: img_seg_omp_target.F90
+	ifx $< -o $@ $(OMP_OPTS)
+
+img_seg_omp_gpu: img_seg_omp_target.F90
+	ifx $< -o $@ $(OMP_OPTS) $(GPU_OPTS)
+
+clean:
+	-rm -f img_seg_do_conc_cpu_seq img_seg_do_conc_cpu_par img_seg_do_conc_gpu
+	-rm -f img_seg_cpu img_seg_omp_cpu img_seg_omp_gpu
+
+# None of these targets produce a file of the same name.
+.PHONY: default all clean run_all run_dc run_omp
diff --git a/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/README.md b/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/README.md
@@ -0,0 +1,118 @@
+# Simple Edge Detection Sample
+Segmentation is a common operation in image processing to find the boundaries of objects in an image.
+This sample implements a simple edge detection algorithm to find object boundaries in a binary image.
+However, this sample is more about offloading Fortran code to a GPU than it is about edge detection.
+The algorithm is implemented in two different but functionally equivalent ways. First, it is implemented
+using ordinary nested for-loops that are parallelized using OpenMP directives. Second, it is implemented
+using a single DO CONCURRENT loop, which is parallelized using the OpenMP backend. In either case, the
+Intel&reg; OpenMP runtime library is capable of offloading the edge detection loops to a GPU.
+
+| Optimized for       | Description
+|:---                 |:---
+| OS                  | Linux* Ubuntu* 18.04 or newer
+| Hardware            | Intel&reg; CPUs and GPUs
+| Software            | Intel&reg; Fortran Compiler
+| What you will learn | How to offload Fortran loops to a GPU
+| Time to complete    | 15 minutes
+
+## Purpose
+This sample demonstrates two Fortran implementations of edge detection:
+
+ 1. img_seg_omp_target.F90 implements edge detection on binary images using ordinary for-loops and OpenMP target directives
+ 2. img_seg_do_concurrent.F90 implements edge detection on binary images using only a DO CONCURRENT loop
+
+The implementations are functionally equivalent. In both cases, the OpenMP runtime library is used to parallelize the
+edge detection loops, regardless of whether they are run on the CPU or offloaded to a GPU.
+
+## Key Implementation Details
+[Using Fortran DO CONCURRENT for Accelerator Offload](https://www.intel.com/content/www/us/en/developer/articles/technical/using-fortran-do-current-for-accelerator-offload.html) provides more detailed descriptions of each example code, and discusses the relative merits of each approach.
+
+## Using Visual Studio Code* (Optional)
+
+You can use Visual Studio Code (VS Code) extensions to set your environment, create launch configurations,
+and browse and download samples.
+
+The basic steps to build and run a sample using VS Code include:
+ - Download a sample using the extension **Code Sample Browser for Intel oneAPI Toolkits**.
+ - Configure the oneAPI environment with the extension **Environment Configurator for Intel oneAPI Toolkits**.
+ - Open a Terminal in VS Code (**Terminal>New Terminal**).
+ - Run the sample in the VS Code terminal using the instructions below.
+ - (Linux only) Debug your GPU application with GDB for Intel® oneAPI toolkits using the **Generate Launch Configurations** extension.
+
+To learn more about the extensions, see
+[Using Visual Studio Code with Intel® oneAPI Toolkits](https://www.intel.com/content/www/us/en/develop/documentation/using-vs-code-with-intel-oneapi/top.html).
+
+After learning how to use the extensions for Intel oneAPI Toolkits, return to this readme for instructions on how to build and run a sample.
+
+## Building and Running this sample
+
+> **Note**: If you have not already done so, set up your CLI
+> environment by sourcing the `setvars` script located in
+> the root of your oneAPI installation.
+>
+> Linux Sudo: . /opt/intel/oneapi/setvars.sh
+>
+> Linux User: . ~/intel/oneapi/setvars.sh
+>
+> For more information on environment variables, see [Use the setvars Script with Linux or macOS](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html).
+
+### On a Linux System
+Run `make` to build and run the sample. Six programs are generated:
+
+ 1. img_seg_cpu runs the for-loop implementation sequentially on the CPU
+ 2. img_seg_omp_cpu runs the for-loops in parallel on the CPU using OpenMP directives
+ 3. img_seg_omp_gpu offloads the for-loops to the GPU using OpenMP target directives
+ 4. img_seg_do_conc_cpu_seq runs the DO CONCURRENT implementation sequentially on the CPU
+ 5. img_seg_do_conc_cpu_par runs the DO CONCURRENT loop in parallel on the CPU
+ 6. img_seg_do_conc_gpu offloads the DO CONCURRENT loop to the GPU using the OpenMP backend
+
+You can remove all generated files with `make clean`.
+
+### Example of Output
+If everything is working correctly, each example program will perform edge detection on a small, randomly-generated binary
+image. It will display the original image followed by the outline of the objects in the image, e.g.:
+```
+OMP_TARGET_OFFLOAD=MANDATORY ./img_seg_omp_gpu -n 12 -o 2 -i 1 -d
+ Grid dimensions:          12
+ Number of images to process:           1
+ Number of objects in each image:           2
+
+ Binary image:
+  0  0  0  0  0  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0  0  0  0  0
+  0  1  1  1  1  1  0  0  0  0  0  0
+  0  1  1  1  1  1  0  0  0  0  0  0
+  0  1  1  1  1  1  0  0  0  0  0  0
+  0  1  1  1  1  1  0  0  0  0  0  0
+  0  1  1  1  1  1  0  0  0  0  0  0
+  0  0  0  0  0  0  1  1  1  0  0  0
+  0  0  0  0  0  0  1  1  1  0  0  0
+  0  0  0  0  0  0  1  1  1  0  0  0
+  0  0  0  0  0  0  0  0  0  0  0  0
+
+ Edge mask:
+  -  -  -  -  -  -  -  -  -  -  -  -
+  -  -  -  -  -  -  -  -  -  -  -  -
+  -  -  -  -  -  -  -  -  -  -  -  -
+  -  T  T  T  T  T  -  -  -  -  -  -
+  -  T  -  -  -  T  -  -  -  -  -  -
+  -  T  -  -  -  T  -  -  -  -  -  -
+  -  T  -  -  -  T  -  -  -  -  -  -
+  -  T  T  T  T  T  -  -  -  -  -  -
+  -  -  -  -  -  -  T  T  T  -  -  -
+  -  -  -  -  -  -  T  -  T  -  -  -
+  -  -  -  -  -  -  T  T  T  -  -  -
+  -  -  -  -  -  -  -  -  -  -  -  -
+ Image           1 took  9.010000000000000E-004 seconds
+ Total time (not including first iteration):  0.000000000000000E+000 seconds
+```
+
+### Troubleshooting
+If an error occurs, troubleshoot the problem using the Diagnostics Utility for Intel® oneAPI Toolkits.
+[Learn more](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html)
+
+## License
+Code samples are licensed under the MIT license. See [License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+Third party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt)
diff --git a/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/img_seg_do_concurrent.F90 b/DirectProgramming/Fortran/EdgeDetection/simple-binary-images/img_seg_do_concurrent.F90
@@ -0,0 +1,168 @@
+!===============================================================================
+!
+! Content:
+!     Implement edge detection on simple binary images using a standard Fortran
+!     DO CONCURRENT loop. The compiler will offload the loop to a GPU using the
+!     OpenMP runtime.
+!
+! Compile for CPU (sequential):
+!     ifx img_seg_do_concurrent.F90 -o img_seg_do_conc_cpu_seq
+!
+! Compile for CPU (parallel):
+!     ifx img_seg_do_concurrent.F90 -o img_seg_do_conc_cpu_par -qopenmp
+!
+! Compile for GPU using the OpenMP backend:
+!     ifx img_seg_do_concurrent.F90 -o img_seg_do_conc_gpu -qopenmp \
+!         -fopenmp-targets=spir64 -fopenmp-target-do-concurrent
+!
+!===============================================================================
+program img_seg_do_conc_example
+    implicit none
+
+    ! Run parameters; the defaults below can be overridden on the command line
+    integer :: n       = 8           ! image and edge mask are n x n
+    integer :: objects = 3           ! objects drawn into each image
+    integer :: images  = 1           ! images to generate and process
+    logical :: display = .false.     ! print each image and its edge mask
+
+    ! Loop indices and status codes (visible to the contained procedures
+    ! through host association)
+    integer :: i, j, img_i, allocstat, stat
+
+    integer, allocatable :: image(:,:)       ! 0 = background, nonzero = object
+    logical, allocatable :: edge_mask(:,:)   ! .true. where an object pixel is an edge
+
+    character (len = 132) :: allocmsg
+    character (len =  32) :: arg1, arg2      ! command-line scratch buffers
+
+    integer (kind=8) :: start_time, end_time, clock_precision
+    real    (kind=8) :: cycle_time, total_time = 0.0d0
+
+    call process_command_line()
+    call system_clock(count_rate = clock_precision)   ! clock ticks per second
+
+    ! Allocate the zero-filled image and the all-false edge mask
+    allocate (image(n, n), source = 0, stat = allocstat, errmsg = allocmsg)
+    if (allocstat > 0) stop trim(allocmsg)
+
+    allocate (edge_mask(n, n), source = .false., stat = allocstat, errmsg = allocmsg)
+    if (allocstat > 0) stop trim(allocmsg)
+
+    ! Generate, outline and time one image per iteration
+    do img_i = 1, images
+        call initialize_image()
+        if (display) call display_image()
+
+        call system_clock(start_time)   ! Start timer
+
+        ! Outline the objects. The loop mask restricts the iterations to
+        ! object (nonzero) pixels. A pixel on the image border is always an
+        ! edge; an interior pixel is an edge when its 3x3 neighbourhood
+        ! contains background. The border test must be decided first so the
+        ! neighbourhood slice never reaches outside the array.
+        do concurrent (j = 1:n, i = 1:n, image(i, j) /= 0)
+            if (i == 1 .or. i == n .or. j == 1 .or. j == n) then
+                edge_mask(i, j) = .true.
+            else if (any(image(i-1:i+1, j-1:j+1) == 0)) then
+                edge_mask(i, j) = .true.
+            endif
+        enddo
+
+        call system_clock(end_time)   ! Stop timer
+        cycle_time = real(end_time - start_time, kind=8) / real(clock_precision, kind=8)
+
+        if (display) call display_edge_mask()
+
+        print *, 'Image', img_i, 'took', cycle_time, 'seconds'
+
+        ! The first image is reported but left out of the running total
+        ! (presumably because it carries one-time start-up cost -- confirm).
+        if (img_i /= 1) total_time = total_time + cycle_time
+
+        edge_mask = .false.   ! Clear the mask for the next image
+    enddo
+    print *, 'Total time (not including first iteration):', total_time, 'seconds'
+
+    deallocate(image, edge_mask)
+
+contains
+    ! Fill the host array `image` with `objects` randomly placed squares.
+    !
+    ! The image is first cleared to 0. Each object is a (2*d+1) x (2*d+1)
+    ! square of 1s whose half-width d is randomly 1 or 2; objects may overlap.
+    ! The half-width is clamped so the square always fits inside the n x n
+    ! grid: without the clamp, grids smaller than 5 x 5 could produce
+    ! x_max < x_min and index outside `image`.
+    subroutine initialize_image()
+        integer :: obj, x, x_min, x_max, y, y_min, y_max, d
+        real :: rn(3)
+
+        image = 0
+        if (n < 1) return   ! Empty image: nothing can be drawn
+
+        ! Create random regions of interest in the image
+        call random_seed()
+        do obj = 1, objects
+            call random_number(rn)
+
+            ! Half-width of the square, limited so that 2*d+1 <= n
+            d = min(1 + floor(2 * rn(1)), (n - 1) / 2)
+
+            ! Centers in [d+1, n-d] keep the whole square inside 1..n
+            x_min = d + 1
+            x_max = n - d
+            x = int(x_min + (x_max - x_min) * rn(2))
+
+            y_min = d + 1
+            y_max = n - d
+            y = int(y_min + (y_max - y_min) * rn(3))
+
+            image(x-d:x+d, y-d:y+d) = 1
+        enddo
+    end subroutine initialize_image
+
+    ! Print the host array `image` on unit 6 under a 'Binary image:' heading.
+    ! One output line is produced per value of the second index, with every
+    ! pixel written in a 3-character integer field.
+    subroutine display_image()
+        integer :: row
+
+        print *
+        print *, 'Binary image:'
+        do row = 1, n
+            ! All pixels of this row go into the same, still-open record ...
+            write(6, advance='no', fmt="(*(i3))") image(1:n, row)
+            ! ... which is then terminated
+            print *
+        enddo
+    end subroutine display_image
+
+    ! Print the host array `edge_mask` on unit 6 under an 'Edge mask:' heading.
+    ! Edge pixels are shown as 'T' and all other pixels as '-', each
+    ! right-justified in a 3-character field.
+    subroutine display_edge_mask()
+        integer :: row, col
+
+        print *
+        print *, 'Edge mask:'
+        do row = 1, n
+            do col = 1, n
+                ! A one-character string in an a3 field is padded on the left,
+                ! matching what an l3 field prints for a .true. value
+                write(6, advance='no', fmt="(a3)") merge('T', '-', edge_mask(col, row))
+            enddo
+            print *
+        enddo
+    end subroutine display_edge_mask
+
+    ! Parse the command line into the host variables n, objects, images and
+    ! display, then print the resulting configuration.
+    !
+    ! Recognized options: -n #, -o #, -i #, -d and -h. The help text is
+    ! printed and the program stops on -h and on an unrecognized option.
+    ! A missing or non-integer option value, or a grid dimension below 1,
+    ! is reported and terminates the program with an error status instead
+    ! of being silently ignored.
+    subroutine process_command_line()
+        integer :: argi
+
+        argi = 1
+        do while (argi <= command_argument_count())
+            call get_command_argument(argi, arg1)
+            select case (arg1)
+                case ('-n')
+                    call read_integer_option(argi, n)
+                    argi = argi + 2
+                case ('-o')
+                    call read_integer_option(argi, objects)
+                    argi = argi + 2
+                case ('-i')
+                    call read_integer_option(argi, images)
+                    argi = argi + 2
+                case ('-d')
+                    display = .true.
+                    argi = argi + 1
+                case ('-h')
+                    call print_help()
+                    stop
+                case default
+                    print *, 'Unrecognized command-line option: ', arg1
+                    call print_help()
+                    stop
+            end select
+        enddo
+
+        ! The image is allocated as n x n, so n must be positive
+        if (n < 1) then
+            print *, 'Image dimensions must be at least 1, got:', n
+            call print_help()
+            error stop 1
+        endif
+
+        print *, 'Grid dimensions:', n
+        print *, 'Number of images to process:', images
+        print *, 'Number of objects in each image:', objects
+    end subroutine process_command_line
+
+    ! Read the integer value that follows the option at position flag_index.
+    !
+    ! flag_index : position of the option flag; the flag text itself is
+    !              expected in the host variable arg1
+    ! opt_value  : receives the parsed integer; left untouched on failure
+    !
+    ! Stops the program with an error status when the value is missing or
+    ! cannot be read as an integer.
+    subroutine read_integer_option(flag_index, opt_value)
+        integer, intent(in)    :: flag_index
+        integer, intent(inout) :: opt_value
+        integer :: parsed
+
+        if (flag_index + 1 > command_argument_count()) then
+            print *, 'Missing value for command-line option: ', trim(arg1)
+            call print_help()
+            error stop 1
+        endif
+
+        call get_command_argument(flag_index + 1, arg2)
+        read(arg2, *, iostat=stat) parsed
+        if (stat /= 0) then
+            print *, 'Invalid value for command-line option ', trim(arg1), ': ', trim(arg2)
+            call print_help()
+            error stop 1
+        endif
+
+        opt_value = parsed
+    end subroutine read_integer_option
+
+    ! Print a summary of every command-line option accepted by the program,
+    ! including -h itself.
+    subroutine print_help()
+        print '(a,/)', 'Command-line options:'
+        print '(a)', '   -n #   image dimensions (integer)'
+        print '(a)', '   -o #   number of objects in image (integer), objects may overlap'
+        print '(a)', '   -i #   number of images to process (integer)'
+        print '(a)', '   -d     display image and object edge mask'
+        print '(a)', '   -h     display this help message and exit'
+    end subroutine print_help
+end program img_seg_do_conc_example