diff --git a/.gitignore b/.gitignore
index 948b5962eb..d032d3d5dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ GPATH
 include/af/version.h
 src/backend/version.hpp
 docs/details/examples.dox
+/TAGS
diff --git a/.gitmodules b/.gitmodules
index 395881a861..c91b7f1585 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "test/gtest"]
 	path = test/gtest
 	url = https://chromium.googlesource.com/external/googletest
+[submodule "src/backend/cpu/threads"]
+	path = src/backend/cpu/threads
+	url = https://github.com/alltheflops/threads.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c79fbcaab0..0def888f6c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12)
 PROJECT(ARRAYFIRE)
 
 SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -9,7 +9,6 @@ INCLUDE(AFInstallDirs)
 
 OPTION(BUILD_TEST "Build Tests" ON)
 OPTION(BUILD_EXAMPLES "Build Examples" ON)
-OPTION(BUILD_GTEST "Download gtest and check for updates. Necessary if you change compilers" ON)
 
 OPTION(BUILD_CPU "Build ArrayFire with a CPU backend" ON)
 
@@ -31,9 +30,6 @@ OPTION(BUILD_DOCS "Create ArrayFire Documentation" OFF)
 OPTION(WITH_COVERAGE "Added code coverage flags" OFF)
 
 OPTION(BUILD_NONFREE "Build ArrayFire nonfree algorithms" OFF)
-OPTION(BUILD_SIFT "Build ArrayFire nonfree algorithms" OFF)
-
-MARK_AS_ADVANCED(BUILD_SIFT)
 
 OPTION(BUILD_UNIFIED "Build Backend-Independent ArrayFire API" ON)
 
@@ -91,17 +87,18 @@ IF(BUILD_GRAPHICS)
 
 ENDIF(BUILD_GRAPHICS)
 
-IF(BUILD_NONFREE)
-  MESSAGE(WARNING "Building With NONFREE ON requires the following patents")
-  SET(BUILD_SIFT ON)
-ENDIF(BUILD_NONFREE)
+IF(BUILD_NONFREE) # test the option by name; IF(${VAR}) would evaluate the option's *value* as a variable/constant
+    MESSAGE(WARNING "Building With NONFREE ON requires the following patents")
+    SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT") # no FORCE: an existing cache value is kept
+    MARK_AS_ADVANCED(BUILD_NONFREE_SIFT)
+ELSE(BUILD_NONFREE)
+    UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE
+ENDIF(BUILD_NONFREE)
 
-IF(BUILD_SIFT)
-  ADD_DEFINITIONS(-DAF_BUILD_SIFT)
+IF(${BUILD_NONFREE_SIFT})
+  ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT)
 
-  IF (NOT BUILD_NONFREE)
-    MESSAGE(WARNING "Building with SIFT requires the following patents")
-  ENDIF()
+  MESSAGE(WARNING "Building with SIFT requires the following patents")
 
   MESSAGE("Method and apparatus for identifying scale invariant features"
     "in an image and use of same for locating an object in an image,\" David"
@@ -110,7 +107,7 @@ IF(BUILD_SIFT)
     "further details, contact David Lowe (lowe@cs.ubc.ca) or the"
     "University-Industry Liaison Office of the University of British"
     "Columbia.")
-ENDIF(BUILD_SIFT)
+ENDIF(${BUILD_NONFREE_SIFT})
 
 INCLUDE_DIRECTORIES(
     "${CMAKE_CURRENT_SOURCE_DIR}/include"
@@ -154,6 +151,10 @@ ELSE(${UNIX}) #Windows
         # http://www.kitware.com/blog/home/post/434
         SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /Gm-")
         SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} /MP /Gm-")
+
+        # Builds that contain debug info require /bigobj
+        SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
+        SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj")
     ENDIF(MSVC)
 ENDIF()
 
@@ -223,7 +224,7 @@ ENDIF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE)
 SET(INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
 SET(BACKEND_DIR "src/backend/\${lowerbackend}")
 CONFIGURE_FILE(
-    ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in
+    ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in
     ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfig.cmake
     @ONLY)
 
@@ -233,11 +234,11 @@ STRING(REGEX REPLACE "[^/]+" ".." reldir "${AF_INSTALL_CMAKE_DIR}")
 SET(INCLUDE_DIR "\${CMAKE_CURRENT_LIST_DIR}/${reldir}/include")
 set(BACKEND_DIR)
 CONFIGURE_FILE(
-    ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in
+    ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in
     ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake
     @ONLY)
 CONFIGURE_FILE(
-    ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfigVersion.cmake.in
+    ${CMAKE_MODULE_PATH}/ArrayFireConfigVersion.cmake.in
     ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfigVersion.cmake
     @ONLY)
 INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake
@@ -265,4 +266,4 @@ ENDIF(APPLE)
 ##
 # Packaging
 ##
-include(${CMAKE_CURRENT_SOURCE_DIR}/CPack.cmake)
+include(${CMAKE_MODULE_PATH}/CPackConfig.cmake)
diff --git a/ArrayFireConfig.cmake.in b/CMakeModules/ArrayFireConfig.cmake.in
similarity index 100%
rename from ArrayFireConfig.cmake.in
rename to CMakeModules/ArrayFireConfig.cmake.in
diff --git a/ArrayFireConfigVersion.cmake.in b/CMakeModules/ArrayFireConfigVersion.cmake.in
similarity index 100%
rename from ArrayFireConfigVersion.cmake.in
rename to CMakeModules/ArrayFireConfigVersion.cmake.in
diff --git a/CPack.cmake b/CMakeModules/CPackConfig.cmake
similarity index 98%
rename from CPack.cmake
rename to CMakeModules/CPackConfig.cmake
index 2e7f1d5a03..de242a99b7 100644
--- a/CPack.cmake
+++ b/CMakeModules/CPackConfig.cmake
@@ -1,6 +1,6 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
-include("${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/Version.cmake")
+INCLUDE("${CMAKE_MODULE_PATH}/Version.cmake")
 
 # CPack package generation
 #SET(CPACK_GENERATOR "TGZ;STGZ")
diff --git a/CMakeModules/FindCBLAS.cmake b/CMakeModules/FindCBLAS.cmake
index b0cd3bdca0..db1d783e9e 100644
--- a/CMakeModules/FindCBLAS.cmake
+++ b/CMakeModules/FindCBLAS.cmake
@@ -53,19 +53,48 @@ SET(CBLAS_ROOT_DIR CACHE STRING
 INCLUDE(CheckTypeSize)
 CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 
-SET(CBLAS_LIB_DIR)
+IF (NOT INTEL_MKL_ROOT_DIR)
+  SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT})
+ENDIF()
 
-SET(CBLAS_ROOT_DIR "${INTEL_MKL_ROOT_DIR}")
+IF(NOT CBLAS_ROOT_DIR)
 
-IF(CBLAS_ROOT_DIR)
-    IF(INTEL_MKL_ROOT_DIR)
-      IF ("${SIZE_OF_VOIDP}" EQUAL 8)
-        SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/intel64")
-      ELSE()
-        SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/ia32")
-      ENDIF()
+  IF (DEFINED ENV{CBLASDIR}) # IF(ENV{X}) tests a variable literally named "ENV{X}" and is never true
+    SET(CBLAS_ROOT_DIR $ENV{CBLASDIR})
+    IF ("${SIZE_OF_VOIDP}" EQUAL 8) # 8-byte pointers: 64-bit target
+        SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64")
+    ELSE()
+        SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib")
+    ENDIF()
+  ENDIF()
+
+  IF (DEFINED ENV{CBLAS_ROOT_DIR}) # evaluated after CBLASDIR, so it wins when both are set
+    SET(CBLAS_ROOT_DIR $ENV{CBLAS_ROOT_DIR})
+    IF ("${SIZE_OF_VOIDP}" EQUAL 8)
+        SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64")
+    ELSE()
+        SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib")
     ENDIF()
-    SET(CBLAS_INCLUDE_DIR "${INTEL_MKL_ROOT_DIR}/include")
+  ENDIF()
+
+  IF (INTEL_MKL_ROOT_DIR)
+    SET(CBLAS_ROOT_DIR ${INTEL_MKL_ROOT_DIR})
+    IF(APPLE)
+        IF ("${SIZE_OF_VOIDP}" EQUAL 8)
+            SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib")
+        ELSE()
+            SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib")
+        ENDIF()
+    ELSE(APPLE) # Windows and Linux
+        IF ("${SIZE_OF_VOIDP}" EQUAL 8)
+            SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib/intel64")
+        ELSE()
+            SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib/ia32")
+        ENDIF()
+    ENDIF(APPLE)
+  ENDIF()
+
+  SET(CBLAS_INCLUDE_DIR "${CBLAS_ROOT_DIR}/include")
 ENDIF()
 
 # Old CBLAS search
@@ -116,14 +145,14 @@ MACRO(CHECK_ALL_LIBRARIES
           NAMES ${_library}
           PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64
           ENV DYLD_LIBRARY_PATH
-          "{CBLAS_LIB_DIR}"
+          "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}"
           )
       ELSE(APPLE)
         FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
           NAMES ${_library}
           PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64
           ENV LD_LIBRARY_PATH
-          "${CBLAS_LIB_DIR}"
+          "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}"
           PATH_SUFFIXES atlas
           )
         IF(NOT ${_prefix}_${library}_LIBRARY)
@@ -132,7 +161,7 @@ MACRO(CHECK_ALL_LIBRARIES
               NAMES ${_library}
               PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64
               ENV LD_LIBRARY_PATH
-              "${CBLAS_LIB_DIR}"
+              "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}"
               PATH_SUFFIXES atlas
               )
           ENDIF(NOT ${_prefix}_${library}_LIBRARY)
@@ -194,6 +223,23 @@ MACRO(CHECK_ALL_LIBRARIES
   ENDIF(NOT _libraries_work)
 ENDMACRO(CHECK_ALL_LIBRARIES)
 
+# MKL CBLAS library? Probes the single dynamic MKL library (mkl_rt) for cblas_dgemm.
+IF(NOT CBLAS_LIBRARIES)
+  CHECK_ALL_LIBRARIES(
+    CBLAS_LIBRARIES
+    CBLAS
+    cblas_dgemm
+    ""
+    "mkl_rt"
+    "mkl_cblas.h"
+    FALSE # CMake arguments are whitespace-separated; a trailing comma would become part of the value
+    TRUE)
+ENDIF(NOT CBLAS_LIBRARIES)
+
+IF(CBLAS_LIBRARIES) # NOTE(review): also true when CBLAS_LIBRARIES was supplied before the MKL probe ran - confirm intent
+  SET(MKL_FOUND ON)
+ENDIF()
+
 # Apple CBLAS library?
 IF(NOT CBLAS_LIBRARIES)
   CHECK_ALL_LIBRARIES(
diff --git a/CMakeModules/FindFFTW.cmake b/CMakeModules/FindFFTW.cmake
index a725f64ecd..3156cec89b 100644
--- a/CMakeModules/FindFFTW.cmake
+++ b/CMakeModules/FindFFTW.cmake
@@ -24,6 +24,25 @@ IF(NOT FFTW_ROOT AND ENV{FFTWDIR})
     SET(FFTW_ROOT $ENV{FFTWDIR})
 ENDIF()
 
+IF (NOT INTEL_MKL_ROOT_DIR)
+  SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT})
+ENDIF()
+
+IF(NOT FFTW_ROOT) # only derive a root when the caller did not provide one; later assignments override earlier ones
+
+  IF (DEFINED ENV{FFTWDIR}) # IF(ENV{X}) tests a variable literally named "ENV{X}" and is never true
+    SET(FFTW_ROOT $ENV{FFTWDIR})
+  ENDIF()
+
+  IF (DEFINED ENV{FFTW_ROOT_DIR})
+    SET(FFTW_ROOT $ENV{FFTW_ROOT_DIR})
+  ENDIF()
+
+  IF (INTEL_MKL_ROOT_DIR) # MKL root, when known, takes precedence over both environment variables
+    SET(FFTW_ROOT ${INTEL_MKL_ROOT_DIR})
+  ENDIF()
+ENDIF()
+
 # Check if we can use PkgConfig
 FIND_PACKAGE(PkgConfig)
 
@@ -44,14 +63,14 @@ IF(FFTW_ROOT)
     #find libs
     FIND_LIBRARY(
         FFTW_LIB
-        NAMES "fftw3" "libfftw3-3" "fftw3-3"
+        NAMES "fftw3" "libfftw3-3" "fftw3-3" "mkl_rt"
         PATHS ${FFTW_ROOT}
         PATH_SUFFIXES "lib" "lib64"
         NO_DEFAULT_PATH
         )
     FIND_LIBRARY(
         FFTWF_LIB
-        NAMES "fftw3f" "libfftw3f-3" "fftw3f-3"
+        NAMES "fftw3f" "libfftw3f-3" "fftw3f-3" "mkl_rt"
         PATHS ${FFTW_ROOT}
         PATH_SUFFIXES "lib" "lib64"
         NO_DEFAULT_PATH
@@ -62,18 +81,18 @@ IF(FFTW_ROOT)
         FFTW_INCLUDES
         NAMES "fftw3.h"
         PATHS ${FFTW_ROOT}
-        PATH_SUFFIXES "include"
+        PATH_SUFFIXES "include" "include/fftw"
         NO_DEFAULT_PATH
         )
 ELSE()
     FIND_LIBRARY(
         FFTW_LIB
-        NAMES "fftw3"
+        NAMES "fftw3" "mkl_rt"
         PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR}
         )
     FIND_LIBRARY(
         FFTWF_LIB
-        NAMES "fftw3f"
+        NAMES "fftw3f" "mkl_rt"
         PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR}
         )
     FIND_PATH(
diff --git a/CMakeModules/FindGLEWmx.cmake b/CMakeModules/FindGLEWmx.cmake
index b90919eb98..a6da72bbf2 100644
--- a/CMakeModules/FindGLEWmx.cmake
+++ b/CMakeModules/FindGLEWmx.cmake
@@ -55,7 +55,6 @@ ELSE (WIN32)
         /sw/lib
         /opt/local/lib
         ${GLEW_ROOT_DIR}/lib
-        NO_DEFAULT_PATH
         DOC "The GLEWmx library")
 
     SET(PX ${CMAKE_STATIC_LIBRARY_PREFIX})
@@ -72,7 +71,6 @@ ELSE (WIN32)
         /sw/lib
         /opt/local/lib
         ${GLEW_ROOT_DIR}/lib
-        NO_DEFAULT_PATH
         DOC "The GLEWmx library")
     UNSET(PX)
     UNSET(SX)
diff --git a/CMakeModules/FindLAPACKE.cmake b/CMakeModules/FindLAPACKE.cmake
index 3bf8a1f362..0732cfaa83 100644
--- a/CMakeModules/FindLAPACKE.cmake
+++ b/CMakeModules/FindLAPACKE.cmake
@@ -9,15 +9,33 @@
 #   LAPACK_INCLUDES            ... LAPACKE include directory
 #
 
-IF(NOT LAPACKE_ROOT AND ENV{LAPACKEDIR})
-  SET(LAPACKE_ROOT $ENV{LAPACKEDIR})
+SET(LAPACKE_ROOT_DIR CACHE STRING
+  "Root directory for custom LAPACK implementation")
+
+IF (NOT INTEL_MKL_ROOT_DIR)
+  SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT})
+ENDIF()
+
+IF(NOT LAPACKE_ROOT_DIR) # only derive a root when the caller did not provide one; later assignments override earlier ones
+
+  IF (DEFINED ENV{LAPACKEDIR}) # IF(ENV{X}) tests a variable literally named "ENV{X}" and is never true
+    SET(LAPACKE_ROOT_DIR $ENV{LAPACKEDIR})
+  ENDIF()
+
+  IF (DEFINED ENV{LAPACKE_ROOT_DIR}) # was ENV{LAPACKE_ROOT_DIR_DIR}: test the same variable that is read below
+    SET(LAPACKE_ROOT_DIR $ENV{LAPACKE_ROOT_DIR})
+  ENDIF()
+
+  IF (INTEL_MKL_ROOT_DIR) # MKL root, when known, takes precedence over both environment variables
+    SET(LAPACKE_ROOT_DIR ${INTEL_MKL_ROOT_DIR})
+  ENDIF()
 ENDIF()
 
 # Check if we can use PkgConfig
 FIND_PACKAGE(PkgConfig)
 
 #Determine from PKG
-IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT)
+IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT_DIR)
   PKG_CHECK_MODULES( PC_LAPACKE QUIET "lapacke")
 ENDIF()
 
@@ -48,40 +66,41 @@ IF(PC_LAPACKE_FOUND)
 
 ELSE(PC_LAPACKE_FOUND)
 
-    IF(LAPACKE_ROOT)
+    IF(LAPACKE_ROOT_DIR)
         #find libs
         FIND_LIBRARY(
             LAPACKE_LIB
-            NAMES "lapacke" "LAPACKE" "liblapacke"
-            PATHS ${LAPACKE_ROOT}
-            PATH_SUFFIXES "lib" "lib64"
+            NAMES "lapacke" "LAPACKE" "liblapacke" "mkl_rt"
+            PATHS ${LAPACKE_ROOT_DIR}
+            PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64"
             DOC "LAPACKE Library"
             NO_DEFAULT_PATH
             )
         FIND_LIBRARY(
             LAPACK_LIB
-            NAMES "lapack" "LAPACK" "liblapack"
-            PATHS ${LAPACKE_ROOT}
-            PATH_SUFFIXES "lib" "lib64"
+            NAMES "lapack" "LAPACK" "liblapack" "mkl_rt"
+            PATHS ${LAPACKE_ROOT_DIR}
+            PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64"
             DOC "LAPACK Library"
             NO_DEFAULT_PATH
             )
         FIND_PATH(
             LAPACKE_INCLUDES
-            NAMES "lapacke.h"
-            PATHS ${LAPACKE_ROOT}
+            NAMES "lapacke.h" "mkl_lapacke.h"
+            PATHS ${LAPACKE_ROOT_DIR}
             PATH_SUFFIXES "include"
             DOC "LAPACKE Include Directory"
             NO_DEFAULT_PATH
             )
-
     ELSE()
         FIND_LIBRARY(
             LAPACKE_LIB
-            NAMES "lapacke" "liblapacke" "openblas"
+            NAMES "lapacke" "liblapacke" "openblas" "mkl_rt"
             PATHS
             ${PC_LAPACKE_LIBRARY_DIRS}
             ${LIB_INSTALL_DIR}
+            /opt/intel/mkl/lib/ia32
+            /opt/intel/mkl/lib/intel64
             /usr/lib64
             /usr/lib
             /usr/local/lib64
@@ -92,10 +111,12 @@ ELSE(PC_LAPACKE_FOUND)
             )
         FIND_LIBRARY(
            LAPACK_LIB
-            NAMES "lapack" "liblapack" "openblas"
+            NAMES "lapack" "liblapack" "openblas" "mkl_rt"
             PATHS
             ${PC_LAPACKE_LIBRARY_DIRS}
             ${LIB_INSTALL_DIR}
+            /opt/intel/mkl/lib/ia32
+            /opt/intel/mkl/lib/intel64
             /usr/lib64
             /usr/lib
             /usr/local/lib64
@@ -106,21 +127,26 @@ ELSE(PC_LAPACKE_FOUND)
             )
         FIND_PATH(
             LAPACKE_INCLUDES
-            NAMES "lapacke.h"
+            NAMES "lapacke.h" "mkl_lapacke.h"
             PATHS
             ${PC_LAPACKE_INCLUDE_DIRS}
             ${INCLUDE_INSTALL_DIR}
+            /opt/intel/mkl/include
             /usr/include
             /usr/local/include
             /sw/include
             /opt/local/include
             DOC "LAPACKE Include Directory"
             )
-    ENDIF(LAPACKE_ROOT)
+    ENDIF(LAPACKE_ROOT_DIR)
 ENDIF(PC_LAPACKE_FOUND)
 
-SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB})
-SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES})
+IF(LAPACKE_LIB AND LAPACK_LIB)
+    SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB})
+ENDIF()
+IF(LAPACKE_INCLUDES)
+    SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES})
+ENDIF()
 
 INCLUDE(FindPackageHandleStandardArgs)
 FIND_PACKAGE_HANDLE_STANDARD_ARGS(LAPACK DEFAULT_MSG
diff --git a/CMakeModules/Version.cmake b/CMakeModules/Version.cmake
index cd5149bd25..8d5b575399 100644
--- a/CMakeModules/Version.cmake
+++ b/CMakeModules/Version.cmake
@@ -2,8 +2,8 @@
 # Make a version file that includes the ArrayFire version and git revision
 #
 SET(AF_VERSION_MAJOR "3")
-SET(AF_VERSION_MINOR "2")
-SET(AF_VERSION_PATCH "2")
+SET(AF_VERSION_MINOR "3")
+SET(AF_VERSION_PATCH "0")
 
 SET(AF_VERSION "${AF_VERSION_MAJOR}.${AF_VERSION_MINOR}.${AF_VERSION_PATCH}")
 SET(AF_API_VERSION_CURRENT ${AF_VERSION_MAJOR}${AF_VERSION_MINOR})
@@ -32,6 +32,11 @@ EXECUTE_PROCESS(
     OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
+IF(NOT GIT_COMMIT_HASH)
+    MESSAGE(STATUS "No git. Setting hash to default")
+    SET(GIT_COMMIT_HASH "default")
+ENDIF()
+
 CONFIGURE_FILE(
     ${CMAKE_MODULE_PATH}/version.h.in
     ${CMAKE_SOURCE_DIR}/include/af/version.h
diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake
index c0de1cb291..03c20435a8 100644
--- a/CMakeModules/build_boost_compute.cmake
+++ b/CMakeModules/build_boost_compute.cmake
@@ -1,6 +1,9 @@
-SET(VER 79aa8f9086fdf6ef6db78e889de0273b0eb7bd19)
-SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz)
-SET(MD5 dba3318cbdac912dddce71f2a38ffa43)
+# If VER is a commit hash, remove the "v" prefix in front of ${VER} in URL.
+# If VER is a release tag, leave the leading "v" out of VER (URL already adds it).
+# This is because of how GitHub names its release tarballs.
+SET(VER 0.5)
+SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz)
+SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58)
 
 SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party")
 SET(srcDir "${thirdPartyDir}/compute-${VER}")
diff --git a/CMakeModules/build_clBLAS.cmake b/CMakeModules/build_clBLAS.cmake
index 6cb1ae8aaf..2289c26393 100644
--- a/CMakeModules/build_clBLAS.cmake
+++ b/CMakeModules/build_clBLAS.cmake
@@ -14,7 +14,7 @@ ENDIF()
 ExternalProject_Add(
     clBLAS-ext
     GIT_REPOSITORY https://github.com/arrayfire/clBLAS.git
-    GIT_TAG 102c832825e8e4d60ad73ca97e95668463294068
+    GIT_TAG af3.3.0
     PREFIX "${prefix}"
     INSTALL_DIR "${prefix}"
     UPDATE_COMMAND ""
diff --git a/CMakeModules/build_clFFT.cmake b/CMakeModules/build_clFFT.cmake
index e1dbb3fe1c..2ab9ccc1ea 100644
--- a/CMakeModules/build_clFFT.cmake
+++ b/CMakeModules/build_clFFT.cmake
@@ -14,7 +14,7 @@ ENDIF()
 ExternalProject_Add(
     clFFT-ext
     GIT_REPOSITORY https://github.com/arrayfire/clFFT.git
-    GIT_TAG 1597f0f35a644789c7ad77efe79014236cca2fab
+    GIT_TAG af3.3.0
     PREFIX "${prefix}"
     INSTALL_DIR "${prefix}"
     UPDATE_COMMAND ""
diff --git a/CMakeModules/osx_install/OSXInstaller.cmake b/CMakeModules/osx_install/OSXInstaller.cmake
index dc3a8b2491..b2514f8e2a 100644
--- a/CMakeModules/osx_install/OSXInstaller.cmake
+++ b/CMakeModules/osx_install/OSXInstaller.cmake
@@ -8,8 +8,75 @@ SET(BIN2CPP_PROGRAM "bin2cpp")
 
 SET(OSX_INSTALL_DIR ${CMAKE_MODULE_PATH}/osx_install)
 
+################################################################################
+## Create Directory Structure
+################################################################################
+SET(OSX_TEMP "${CMAKE_BINARY_DIR}/osx_install_files")
+
+# Common files - libforge, ArrayFireConfig*.cmake
+FILE(GLOB COMMONLIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/libforge*.dylib")
+FILE(GLOB COMMONCMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFireConfig*.cmake")
+
+ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_COMMON)
+FOREACH(SRC ${COMMONLIB} ${COMMONCMAKE})
+    FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC})
+    ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_COMMON PRE_BUILD
+                       COMMAND ${CMAKE_COMMAND} -E copy
+                       ${SRC} "${OSX_TEMP}/common/${SRC_REL}"
+                       WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                       COMMENT "Copying Common files to temporary OSX Install Dir"
+                       )
+ENDFOREACH()
+
+# Backends - CPU, CUDA, OpenCL, Unified
+MACRO(OSX_INSTALL_SETUP BACKEND LIB) # BACKEND: staging sub-directory and target suffix; LIB: library base name, matched as lib${LIB}*.dylib
+    FILE(GLOB ${BACKEND}LIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/lib${LIB}*.dylib") # NOTE(review): FILE(GLOB) runs at configure time, so only files already present under the install prefix are listed
+    FILE(GLOB ${BACKEND}CMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFire${BACKEND}*.cmake") # per-backend CMake config files
+
+    ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_${BACKEND}) # empty target; the copy steps below are attached to it
+    FOREACH(SRC ${${BACKEND}LIB} ${${BACKEND}CMAKE}) # every globbed library and config file
+        FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) # path of the file relative to the install prefix
+        ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_${BACKEND} PRE_BUILD
+                           COMMAND ${CMAKE_COMMAND} -E copy
+                           ${SRC} "${OSX_TEMP}/${BACKEND}/${SRC_REL}" # mirrors the install-relative layout under the staging dir
+                           WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                           COMMENT "Copying ${BACKEND} files to temporary OSX Install Dir"
+                           )
+    ENDFOREACH()
+ENDMACRO(OSX_INSTALL_SETUP)
+
+OSX_INSTALL_SETUP(CPU afcpu)
+OSX_INSTALL_SETUP(CUDA afcuda)
+OSX_INSTALL_SETUP(OpenCL afopencl)
+OSX_INSTALL_SETUP(Unified af)
+
+# Headers
+ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_INCLUDE
+                  COMMAND ${CMAKE_COMMAND} -E copy_directory
+                  ${CMAKE_INSTALL_PREFIX}/include "${OSX_TEMP}/include"
+                  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                  COMMENT "Copying header files to temporary OSX Install Dir"
+                  )
+
+# Examples
+ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_EXAMPLES
+                  COMMAND ${CMAKE_COMMAND} -E copy_directory
+                  "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/examples" "${OSX_TEMP}/examples"
+                  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                  COMMENT "Copying examples files to temporary OSX Install Dir"
+                  )
+
+# Documentation
+ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_DOC
+                  COMMAND ${CMAKE_COMMAND} -E copy_directory
+                  "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/doc" "${OSX_TEMP}/doc"
+                  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                  COMMENT "Copying documentation files to temporary OSX Install Dir"
+                  )
+################################################################################
+
 FUNCTION(PKG_BUILD)
-    CMAKE_PARSE_ARGUMENTS(ARGS "" "INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN})
+    CMAKE_PARSE_ARGUMENTS(ARGS "" "DEPENDS;INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN})
 
     FOREACH(filter ${ARGS_FILTERS})
         LIST(APPEND  FILTER_LIST --filter ${filter})
@@ -70,50 +137,69 @@ ENDFUNCTION(PRODUCT_BUILD)
 
 
 PKG_BUILD(  PKG_NAME        ArrayFireCPU
-            DEPENDS         afcpu
+            DEPENDS         OSX_INSTALL_SETUP_CPU
             TARGETS         cpu_package
-            INSTALL_LOCATION /usr/local/lib
+            INSTALL_LOCATION /usr/local
             SCRIPT_DIR      ${OSX_INSTALL_DIR}/cpu_scripts
             IDENTIFIER      com.arrayfire.pkg.arrayfire.cpu.lib
-            PATH_TO_FILES   package/lib
+            PATH_TO_FILES   ${OSX_TEMP}/CPU
             FILTERS         opencl cuda unified)
 
 PKG_BUILD(  PKG_NAME        ArrayFireCUDA
-            DEPENDS         afcuda
+            DEPENDS         OSX_INSTALL_SETUP_CUDA
             TARGETS         cuda_package
-            INSTALL_LOCATION /usr/local/lib
+            INSTALL_LOCATION /usr/local
             SCRIPT_DIR      ${OSX_INSTALL_DIR}/cuda_scripts
             IDENTIFIER      com.arrayfire.pkg.arrayfire.cuda.lib
-            PATH_TO_FILES   package/lib
+            PATH_TO_FILES   ${OSX_TEMP}/CUDA
             FILTERS         cpu opencl unified)
 
 PKG_BUILD(  PKG_NAME        ArrayFireOPENCL
-            DEPENDS         afopencl
+            DEPENDS         OSX_INSTALL_SETUP_OpenCL
             TARGETS         opencl_package
-            INSTALL_LOCATION /usr/local/lib
+            INSTALL_LOCATION /usr/local
             IDENTIFIER      com.arrayfire.pkg.arrayfire.opencl.lib
-            PATH_TO_FILES   package/lib
+            PATH_TO_FILES   ${OSX_TEMP}/OpenCL
             FILTERS         cpu cuda unified)
 
 PKG_BUILD(  PKG_NAME        ArrayFireUNIFIED
-            DEPENDS         af
+            DEPENDS         OSX_INSTALL_SETUP_Unified
             TARGETS         unified_package
-            INSTALL_LOCATION /usr/local/lib
+            INSTALL_LOCATION /usr/local
             IDENTIFIER      com.arrayfire.pkg.arrayfire.unified.lib
-            PATH_TO_FILES   package/lib
+            PATH_TO_FILES   ${OSX_TEMP}/Unified
             FILTERS         cpu cuda opencl)
 
+PKG_BUILD(  PKG_NAME        ArrayFireCommon
+            DEPENDS         OSX_INSTALL_SETUP_COMMON
+            TARGETS         common_package
+            INSTALL_LOCATION /usr/local
+            IDENTIFIER      com.arrayfire.pkg.arrayfire.libcommon
+            PATH_TO_FILES   ${OSX_TEMP}/common
+            FILTERS         cpu cuda opencl unified)
+
 PKG_BUILD(  PKG_NAME        ArrayFireHeaders
+            DEPENDS         OSX_INSTALL_SETUP_INCLUDE
             TARGETS         header_package
             INSTALL_LOCATION /usr/local/include
             IDENTIFIER      com.arrayfire.pkg.arrayfire.inc
-            PATH_TO_FILES   package/include)
-
-PKG_BUILD(  PKG_NAME        ArrayFireExtra
-            TARGETS         extra_package
-            INSTALL_LOCATION /usr/local/share
-            IDENTIFIER      com.arrayfire.pkg.arrayfire.extra
-            PATH_TO_FILES   package/share)
-
-PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${extra_package})
+            PATH_TO_FILES   ${OSX_TEMP}/include)
+
+PKG_BUILD(  PKG_NAME        ArrayFireExamples
+            DEPENDS         OSX_INSTALL_SETUP_EXAMPLES
+            TARGETS         examples_package
+            INSTALL_LOCATION /usr/local/share/ArrayFire/examples
+            IDENTIFIER      com.arrayfire.pkg.arrayfire.examples
+            PATH_TO_FILES   ${OSX_TEMP}/examples
+            FILTERS         cmake)
+
+PKG_BUILD(  PKG_NAME        ArrayFireDoc
+            DEPENDS         OSX_INSTALL_SETUP_DOC
+            TARGETS         doc_package
+            INSTALL_LOCATION /usr/local/share/ArrayFire/doc
+            IDENTIFIER      com.arrayfire.pkg.arrayfire.doc
+            PATH_TO_FILES   ${OSX_TEMP}/doc
+            FILTERS         cmake)
+
+PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${common_package} ${header_package} ${examples_package} ${doc_package})
 
diff --git a/CMakeModules/osx_install/distribution.dist b/CMakeModules/osx_install/distribution.dist
index 3dc82379c9..b476bf013f 100644
--- a/CMakeModules/osx_install/distribution.dist
+++ b/CMakeModules/osx_install/distribution.dist
@@ -17,7 +17,9 @@
     <pkg-ref id="com.arrayfire.arrayfire.opencl.lib"    version="${AF_VERSION}" onConclusion="none">ArrayFireOPENCL.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.unified.lib"   version="${AF_VERSION}" onConclusion="none">ArrayFireUNIFIED.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.inc"           version="${AF_VERSION}" onConclusion="none">ArrayFireHeaders.pkg</pkg-ref>
-    <pkg-ref id="com.arrayfire.arrayfire.extra"         version="${AF_VERSION}" onConclusion="none">ArrayFireExtra.pkg</pkg-ref>
+    <pkg-ref id="com.arrayfire.arrayfire.examples"      version="${AF_VERSION}" onConclusion="none">ArrayFireExamples.pkg</pkg-ref>
+    <pkg-ref id="com.arrayfire.arrayfire.doc"           version="${AF_VERSION}" onConclusion="none">ArrayFireDoc.pkg</pkg-ref>
+    <pkg-ref id="com.arrayfire.arrayfire.libcommon"     version="${AF_VERSION}" onConclusion="none">ArrayFireCommon.pkg</pkg-ref>
     <options customize="always" require-scripts="false"/>
     <choices-outline>
         <line choice="libs">
@@ -25,38 +27,51 @@
             <line choice="cuda_lib"/>
             <line choice="opencl_lib"/>
             <line choice="com.arrayfire.arrayfire.unified.lib"/>
+            <line choice="com.arrayfire.arrayfire.libcommon"/>
         </line>
         <line choice="com.arrayfire.arrayfire.inc"/>
-        <line choice="com.arrayfire.arrayfire.extra"/>
+        <line choice="com.arrayfire.arrayfire.examples"/>
+        <line choice="com.arrayfire.arrayfire.doc"/>
     </choices-outline>
     <choice id="libs" title="ArrayFire Libraries" visible="true" />
     <choice title="CPU Libraries"
-            description="ArrayFire targeting CPUs."
+            description="ArrayFire targeting CPUs. Also installs the corresponding CMake config files."
             id="cpu_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.cpu.lib"/>
     </choice>
     <choice title="CUDA Libraries"
-            description="ArrayFire which targets the CUDA platform. This platform allows you to to take advantage of the CUDA enabled GPUs to run ArrayFire code."
+            description="ArrayFire which targets the CUDA platform. This platform allows you to take advantage of the CUDA enabled GPUs to run ArrayFire code. Also installs the corresponding CMake config files."
             id="cuda_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.cuda.lib"/>
     </choice>
     <choice title="OpenCL Libraries"
-            description="ArrayFire which targets the OpenCL platform. This platform allows you to use the ArrayFire library which targets OpenCL devices. NOTE: Currently ArrayFire does not support OpenCL for the Intel CPU on Apple."
+            description="ArrayFire which targets the OpenCL platform. This platform allows you to use the ArrayFire library which targets OpenCL devices. Also installs the corresponding CMake config files. NOTE: Currently ArrayFire does not support OpenCL for the Intel CPU on Apple."
             id="opencl_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.opencl.lib"/>
     </choice>
     <choice title="Unified Library"
-            description="This library will allow you to choose the platform(cpu, cuda, opencl) at runtime. NOTE: This option requires the other platforms to work properly"
+            description="This library will allow you to choose the platform(cpu, cuda, opencl) at runtime. Also installs the corresponding CMake config files. NOTE: This option requires the other platforms to work properly"
             id="com.arrayfire.arrayfire.unified.lib"
             selected="CheckBackendSelected()"
             visible="true"
             enabled="CheckBackendSelected()">
         <pkg-ref id="com.arrayfire.arrayfire.unified.lib"/>
     </choice>
+    <choice title="Library Common"
+            description="Installs Forge and ArrayFireConfig.cmake files"
+            id="com.arrayfire.arrayfire.libcommon"
+            selected="CheckBackendSelected()"
+            visible="false"
+            enabled="CheckBackendSelected()">
+        <pkg-ref id="com.arrayfire.arrayfire.libcommon"/>
+    </choice>
     <choice title="ArrayFire Headers" description="ArrayFire Headers" id="com.arrayfire.arrayfire.inc" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.inc"/>
     </choice>
-    <choice title="Extras" description="Extra files include documentation, examples, and cmake scripts to find and use ArrayFire with cmake," id="com.arrayfire.arrayfire.extra" visible="true" enabled="true">
-        <pkg-ref id="com.arrayfire.arrayfire.extra"/>
+    <choice title="Examples" description="ArrayFire Examples" id="com.arrayfire.arrayfire.examples" visible="true" enabled="true">
+        <pkg-ref id="com.arrayfire.arrayfire.examples"/>
+    </choice>
+    <choice title="Documentation" description="ArrayFire Documentation" id="com.arrayfire.arrayfire.doc" visible="true" enabled="true">
+        <pkg-ref id="com.arrayfire.arrayfire.doc"/>
     </choice>
 </installer-gui-script>
diff --git a/CMakeModules/osx_install/readme.html b/CMakeModules/osx_install/readme.html
index 41d4ab8cf0..482b7add7e 100644
--- a/CMakeModules/osx_install/readme.html
+++ b/CMakeModules/osx_install/readme.html
@@ -5,18 +5,9 @@ <h2>Install Directories</h2>
     <ul>
         <li> Libraries will be installed in <code>/usr/local/lib</code> </li>
         <li> Headers will be installed in <code>/usr/local/include</code> </li>
-        <li> Docs and other files will be installed in <code>/usr/local/share</code> </li>
-    </ul>
-
-    <h2> Major Updates </h2>
-    <ul>
-        <li> ArrayFire is now open source</li>
-        <li> Major changes to the visualization library</li>
-        <li> Introducing handle based C API</li>
-        <li> New backend: CPU fallback available for systems without GPUs</li>
-        <li> Dense linear algebra functions available for all backends</li>
-        <li> Support for 64 bit integers</li>
+        <li> Examples, documentation and CMake config files will be installed in <code>/usr/local/share</code> </li>
     </ul>
+    <p> For a complete list of updates, visit <a href="http://www.arrayfire.com/docs/releasenotes.htm">ArrayFire Release Notes</a></p>
 
 </body>
 </html>
diff --git a/README.md b/README.md
index 695adbed03..f43b9fd098 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ ArrayFire binary installers can be downloaded at the [ArrayFire Downloads](http:
 ### Build Status
 |         | Linux x86 | Linux armv7l | Linux aarch64 | Windows | OSX |
 |:-------:|:---------:|:------------:|:-------------:|:-------:|:---:|
-| Build   | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/devel)](http://ci.arrayfire.org/job/arrayfire-linux/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/devel)](http://ci.arrayfire.org/job/arrayfire-windows/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/devel)](http://ci.arrayfire.org/job/arrayfire-osx/branch/devel/) |
-| Test    | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux-test/devel)](http://ci.arrayfire.org/job/arrayfire-linux-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows-test/devel)](http://ci.arrayfire.org/job/arrayfire-windows-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx-test/devel)](http://ci.arrayfire.org/job/arrayfire-osx-test/branch/devel/) |
+| Build   | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/build/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/build/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/build/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/build/branch/devel/) |
+| Test    | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/test/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/test/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/test/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/test/branch/devel/) |
 
 Test coverage: [![Coverage Status](https://coveralls.io/repos/arrayfire/arrayfire/badge.svg?branch=HEAD)](https://coveralls.io/r/arrayfire/arrayfire?branch=HEAD)
 
diff --git a/assets b/assets
index 8030a5c626..f16f8bf74f 160000
--- a/assets
+++ b/assets
@@ -1 +1 @@
-Subproject commit 8030a5c626777a5b3f46b319dd4d1723eca4b0f9
+Subproject commit f16f8bf74fe4a255db05884cfff8f5cb0e6e8e09
diff --git a/docs/arrayfire.css b/docs/arrayfire.css
index 75dba64e3a..e4fe2860be 100644
--- a/docs/arrayfire.css
+++ b/docs/arrayfire.css
@@ -52,12 +52,6 @@ a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited
     color       :   #4665A2;
 }
 
-@font-face
-{
-    font-family :   prototype;
-    src         :   url('Prototype.ttf');
-}
-
 /*image and image groups*/
 div.image_group
 {
@@ -96,7 +90,6 @@ div.support *
 
 #under_logo
 {
-    font-family :   prototype;
     font-size   :   2em;
     max-width   :   25px;
     color       :   #000000;
@@ -104,7 +97,6 @@ div.support *
 
 #projectbrief
 {
-    font-family :   prototype;
     color       :   #555555
 }
 
@@ -121,7 +113,6 @@ div.support *
 
 #projectname
 {
-    font-family     :   prototype;
     font-size       :   3em;
     max-width       :   25px;
     color           :   #555555
diff --git a/docs/details/arith.dox b/docs/details/arith.dox
index 50f82aafed..a75c3a2cc4 100644
--- a/docs/details/arith.dox
+++ b/docs/details/arith.dox
@@ -448,8 +448,6 @@ Raise an array to a power
 
 Exponential of input
 
-\copydoc arith_real_only
-
 
 \defgroup arith_func_expm1 expm1
 
diff --git a/docs/details/backend.dox b/docs/details/backend.dox
index 4d9cdf6f53..893567b696 100644
--- a/docs/details/backend.dox
+++ b/docs/details/backend.dox
@@ -71,5 +71,23 @@ The return value specifies which backend the array was created on.
 
 =======================================================================
 
+\defgroup unified_func_getactivebackend getActiveBackend
+
+\brief Gets the backend enum for the active backend
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
+\defgroup unified_func_getdeviceid getDeviceId
+
+\brief Gets the id of the device an array was created on.
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
 @}
 */
diff --git a/docs/details/device.dox b/docs/details/device.dox
index 230199d583..1aa43e7465 100644
--- a/docs/details/device.dox
+++ b/docs/details/device.dox
@@ -2,6 +2,22 @@
 \addtogroup arrayfire_func
 @{
 
+\defgroup device_func_prop deviceInfo
+\ingroup device_mat
+
+\brief Gets the information about device and platform as strings
+
+\param d_name pointer to a user-allocated char array. Recommended minimum size is 64.
+The name of the device is stored in this array.
+\param d_platform pointer to a user-allocated char array. Recommended minimum size is 10.
+The platform information is stored in this array.
+\param d_toolkit pointer to a user-allocated char array. Recommended minimum size is 64.
+The toolkit information is stored in this array.
+\param d_compute pointer to a user-allocated char array. Recommended minimum size is 10.
+The compute version of the device is stored in this array.
+
+===============================================================================
+
 \defgroup device_func_count getDeviceCount
 \ingroup device_mat
 
@@ -62,6 +78,16 @@ allocation
 
 ===============================================================================
 
+\defgroup device_func_free free
+\ingroup device_mat
+
+\brief Free device memory allocated by ArrayFire's memory manager
+
+These calls free the device memory. These functions need to be called on
+pointers allocated using alloc function.
+
+===============================================================================
+
 \defgroup device_func_pinned pinned
 \ingroup device_mat
 
@@ -73,12 +99,39 @@ a limited resource.
 
 ===============================================================================
 
-\defgroup device_func_free free
+\defgroup device_func_free_pinned freePinned
 \ingroup device_mat
 
-\brief Free device memory allocated by ArrayFire's memory manager
+\brief Free pinned memory allocated by ArrayFire's memory manager
+
+These calls free the pinned memory on host. These functions need to be called on
+pointers allocated using pinned function.
+
+===============================================================================
+
+\defgroup device_func_alloc_host allocHost
+\ingroup device_mat
+
+\brief Allocate memory on host
+
+This function is used for allocating regular memory on host. This is useful
+where the compiler version of ArrayFire library is different from the
+executable's compiler version.
+
+It does not use ArrayFire's memory manager.
+
+===============================================================================
+
+\defgroup device_func_free_host freeHost
+\ingroup device_mat
+
+\brief Free memory allocated on host internally by ArrayFire
+
+This function is used for freeing memory on host that was allocated within
+ArrayFire. This is useful where the compiler version of ArrayFire library is
+different from the executable's compiler version.
 
-These calls free the device or pinned memory. These functions need to be called
+It does not use ArrayFire's memory manager.
 
 ===============================================================================
 
diff --git a/docs/details/image.dox b/docs/details/image.dox
index 234f4f72e9..ef6d12a4f0 100644
--- a/docs/details/image.dox
+++ b/docs/details/image.dox
@@ -430,6 +430,12 @@ Save an array to disk as an image
 Supported formats include JPG, PNG, PPM and other formats supported by freeimage
 
 
+\defgroup imageio_func_available isImageIoAvailable
+\ingroup imageio_mat
+
+Returns true if ArrayFire was compiled with ImageIO (FreeImage) support
+
+
 \defgroup imagemem_func_load loadImageMem
 \ingroup imageio_mat
 
@@ -501,10 +507,12 @@ grad(dx, dy, in);
 
 Resize an input image
 
-Resizing an input image can be done using either \ref AF_INTERP_NEAREST or
-\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the
-nearest value to the location, whereas bilinear interpolation will do a
-weighted interpolation for calculate the new size.
+Resizing an input image can be done using \ref AF_INTERP_NEAREST,
+\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER interpolations. Nearest
+interpolation will pick the nearest value to the location, bilinear
+interpolation will do a weighted interpolation to calculate the new size,
+and lower interpolation is similar to nearest, except that it uses the
+floor function to get the lower neighbor.
 
 This function does not differentiate between images and data. As long as
 the array is defined and the output dimensions are not 0, it will resize any
@@ -556,10 +564,10 @@ Rotate an input image
 
 The angle theta is in radians.
 
-Rotating an input image can be done using either \ref AF_INTERP_NEAREST or
-\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the
-nearest value to the location, whereas bilinear interpolation will do a
-weighted interpolation for calculate the new size.
+Rotating an input image can be done using \ref AF_INTERP_NEAREST,
+\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER interpolations. Nearest
+interpolation will pick the nearest value to the location, whereas bilinear
+interpolation will do a weighted interpolation to calculate the new size.
 
 This function does not differentiate between images and data. As long as
 the array is defined, it will rotate any type or size of array.
@@ -659,26 +667,51 @@ Skew is a special case of the \ref af::transform function.
 
 Transform an input image
 
-The transform function uses an affine transform matrix to tranform an input
+The transform function uses an affine or perspective transform matrix to transform an input
 image into a new one.
 
-The transform matrix \p tf is a 3x2 matrix of type float. The matrix operation
-is applied to each location (x, y) that is then transformed to (x', y') of the
+If matrix \p tf is a 3x2 matrix, an affine transformation will be performed. The matrix
+operation is applied to each location (x, y) that is then transformed to (x', y') of the
 new array. Hence the transformation is an element-wise operation.
 
-The operation is as below:
-tf = [r00 r10
-      r01 r11
+The operation is as below:\n
+tf = [r00 r10\n
+      r01 r11\n
       t0  t1]
 
-x' = x * r00 + y * r01 + t0;
+x' = x * r00 + y * r01 + t0;\n
 y' = x * r10 + y * r11 + t1;
 
-Interpolation types of \ref AF_INTERP_NEAREST and \ref AF_INTERP_BILINEAR are allowed.
+If matrix \p tf is a 3x3 matrix, a perspective transformation will be performed.
+
+The operation is as below:\n
+tf = [r00 r10 r20\n
+      r01 r11 r21\n
+      t0  t1  t2]
+
+x' = (x * r00 + y * r01 + t0) / (x * r20 + y * r21 + t2);\n
+y' = (x * r10 + y * r11 + t1) / (x * r20 + y * r21 + t2);
+
+The transformation matrix \p tf should always be of type f32.
+
+Interpolation types of \ref AF_INTERP_NEAREST, \ref AF_INTERP_BILINEAR and
+\ref AF_INTERP_LOWER are allowed.
 
 Affine transforms can be used for various purposes. \ref af::translate, \ref af::scale and \ref af::skew
 are specializations of the transform function.
 
+
+\defgroup transform_func_coordinates transformcoordinates
+\ingroup transform_mat
+
+Transform input coordinates
+
+The transform function uses a perspective transform matrix to transform input
+coordinates (given as two dimensions) into a coordinates matrix.
+
+The output is a 4x2 matrix, indicating the coordinates of the 4 bidimensional
+transformed points.
+
 =======================================================================
 
 \defgroup image_func_sat SAT
diff --git a/docs/details/internal.dox b/docs/details/internal.dox
new file mode 100644
index 0000000000..5ac06422ca
--- /dev/null
+++ b/docs/details/internal.dox
@@ -0,0 +1,29 @@
+/**
+\addtogroup internal_func
+@{
+
+\defgroup internal_func_create createStridedArray
+
+Create an array with specified strides and offset.
+
+
+\defgroup internal_func_strides getStrides
+
+Get strides of underlying data.
+
+
+\defgroup internal_func_offset getOffset
+
+Get Offset of the underlying data.
+
+
+\defgroup internal_func_linear isLinear
+
+Check if all elements in array are contiguous.
+
+\defgroup internal_func_owner isOwner
+
+Check if underlying data is owned by the current array.
+
+@}
+*/
diff --git a/docs/details/lapack.dox b/docs/details/lapack.dox
index c0d8aae5b9..522dbe544f 100644
--- a/docs/details/lapack.dox
+++ b/docs/details/lapack.dox
@@ -287,5 +287,13 @@ This function can return the norm using various metrics based on the type paramt
 
 ===============================================================================
 
+\defgroup lapack_helper_func_available isLAPACKAvailable
+
+\ingroup lapack_helper
+
+\brief Returns true if ArrayFire is compiled with LAPACK support
+
+===============================================================================
+
 @}
 */
diff --git a/docs/details/vision.dox b/docs/details/vision.dox
index 1d9d6b99ac..99582c3729 100644
--- a/docs/details/vision.dox
+++ b/docs/details/vision.dox
@@ -166,9 +166,12 @@ from the other and returns the result.
 
 \brief Template Matching
 
-Template matching is an image processing technique to find small patches of an image which
-match a given template image. A more in depth discussion on the topic can be found
-[here](http://en.wikipedia.org/wiki/Template_matching).
+Template matching is an image processing technique to find small patches of an image which match a given template image. This function does not yet support the following three metrics:
+- \ref AF_NCC
+- \ref AF_ZNCC
+- \ref AF_SHD
+
+A more in depth discussion about template matching can be found [here](http://en.wikipedia.org/wiki/Template_matching).
 
 =======================================================================
 
diff --git a/docs/pages/INSTALL.md b/docs/pages/INSTALL.md
index 3565889571..d31affaefe 100644
--- a/docs/pages/INSTALL.md
+++ b/docs/pages/INSTALL.md
@@ -108,13 +108,14 @@ First install the prerequisite packages:
     # Prerequisite packages:
     sudo apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev cmake
 
-Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the library from source (following the instructions listed) or install the library from a PPA as follows:
-
-```
-sudo apt-add repository ppa:keithw/glfw3
-sudo apt-get update
-sudo apt-get install glfw3
-```
+Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the
+library from source (following the
+[instructions listed here](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire)) or
+install the library from a PPA as follows:
+
+    sudo apt-add-repository ppa:keithw/glfw3
+    sudo apt-get update
+    sudo apt-get install glfw3
 
 After this point, the installation should proceed identically to Ubuntu 14.10 or newer.
 
diff --git a/docs/pages/README.md b/docs/pages/README.md
index 302690242e..8a395a70af 100644
--- a/docs/pages/README.md
+++ b/docs/pages/README.md
@@ -76,7 +76,7 @@ Each ArrayFire installation comes with:
 ArrayFire supports batched operations on N-dimensional arrays.
 Batch operations in ArrayFire are run in parallel ensuring an optimal usage of your CUDA or OpenCL device.
 
-You can get the best performance out of ArrayFire using [vectorization techniques]().
+You can get the best performance out of ArrayFire using [vectorization techniques](\ref vectorization).
 
 ArrayFire can also execute loop iterations in parallel with
 [the gfor function](\ref gfor).
@@ -92,8 +92,8 @@ Read more about how [ArrayFire JIT](http://arrayfire.com/performance-of-arrayfir
 
 ## Simple Example
 
-Here's a live example to let you see ArrayFire code. You create [arrays](\ref
-construct_mat) which reside on CUDA or OpenCL devices. Then you can use
+Here's a live example to let you see ArrayFire code. You create [arrays](\ref construct_mat)
+which reside on CUDA or OpenCL devices. Then you can use
 [ArrayFire functions](modules.htm) on those [arrays](\ref construct_mat).
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md
index 054068e224..d554046f1e 100644
--- a/docs/pages/configuring_arrayfire_environment.md
+++ b/docs/pages/configuring_arrayfire_environment.md
@@ -18,6 +18,16 @@ This is the path with ArrayFire gets installed, ie. the includes and libs are
 present in this directory. You can use this variable to add include paths and
 libraries to your projects.
 
+AF_PRINT_ERRORS {#af_print_errors}
+-------------------------------------------------------------------------------
+
+When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and
+detailed. This helps in locating the exact failure.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AF_PRINT_ERRORS=1 ./myprogram
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 AF_CUDA_DEFAULT_DEVICE {#af_cuda_default_device}
 -------------------------------------------------------------------------------
 
@@ -44,25 +54,116 @@ AF_OPENCL_DEFAULT_DEVICE=1 ./myprogram_opencl
 Note: af::setDevice call in the source code will take precedence over this
 variable.
 
+AF_OPENCL_DEFAULT_DEVICE_TYPE {#af_opencl_default_device_type}
+-------------------------------------------------------------------------------
+
+Use this variable to set the default OpenCL device type. Valid values for this
+variable are: CPU, GPU, ACC (Accelerators).
+
+When set, the first device of the specified type is chosen as default device.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AF_OPENCL_DEFAULT_DEVICE_TYPE=CPU ./myprogram_opencl
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Note: `AF_OPENCL_DEFAULT_DEVICE` and af::setDevice take precedence over this variable.
+
+AF_OPENCL_DEVICE_TYPE {#af_opencl_device_type}
+-------------------------------------------------------------------------------
+
+Use this variable to only choose OpenCL devices of specified type. Valid values for this
+variable are:
+
+- ALL: All OpenCL devices. (Default behavior).
+- CPU: CPU devices only.
+- GPU: GPU devices only.
+- ACC: Accelerator devices only.
+
+When set, the remaining OpenCL device types are ignored by the OpenCL backend.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AF_OPENCL_DEVICE_TYPE=CPU ./myprogram_opencl
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+AF_OPENCL_CPU_OFFLOAD {#af_opencl_cpu_offload}
+-------------------------------------------------------------------------------
+
+When this variable is set to 1, and the selected OpenCL device has unified
+memory with the host (ie. `CL_DEVICE_HOST_UNIFIED_MEMORY` is true for device),
+then certain functions are offloaded to run on the CPU using mapped buffers.
+
+This takes advantage of fast libraries such as MKL while spending no time
+copying memory from device to host. The device memory is mapped to a host
+pointer which can be used in the offloaded functions.
+
+AF_OPENCL_SHOW_BUILD_INFO {#af_opencl_show_build_info}
+-------------------------------------------------------------------------------
+
+This variable is useful when debugging OpenCL kernel compilation failures. When
+this variable is set to 1, and an error occurs during an OpenCL kernel
+compilation, then the log and kernel are printed to screen.
+
 AF_DISABLE_GRAPHICS {#af_disable_graphics}
 -------------------------------------------------------------------------------
 
-Setting this variable will disable window creation when graphics functions are
-being called. Simply setting this variable will disable functionality, any
-value will suffice. Disabling window creation will disable all other graphics
-calls at runtime as well.
+Setting this variable to 1 will disable window creation when graphics
+functions are being called. Disabling window creation will disable all other
+graphics calls at runtime as well.
 
 This is a useful enviornment variable when running code on servers and systems
 without displays. When graphics calls are run on such machines, they will
 print warning about window creation failing. To suppress those calls, set this
 variable.
 
-AF_PRINT_ERRORS {#af_print_errors}
+AF_SYNCHRONOUS_CALLS {#af_synchronous_calls}
 -------------------------------------------------------------------------------
 
-When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and
-detailed. This helps in locating the exact failure.
+When this environment variable is set to 1, ArrayFire will execute all
+functions synchronously.
+
+AF_SHOW_LOAD_PATH {#af_show_load_path}
+-------------------------------------------------------------------------------
+
+When using the Unified backend, if this variable is set to 1, it will show the
+path where the ArrayFire backend libraries are loaded from.
+
+If the libraries are loaded from system paths, such as PATH or LD_LIBRARY_PATH
+etc, then it will print "system path". If the libraries are loaded from other
+paths, then those paths are shown in full.
+
+AF_MEM_DEBUG {#af_mem_debug}
+-------------------------------------------------------------------------------
+
+When AF_MEM_DEBUG is set to 1 (or anything not equal to 0), the caching mechanism in the memory manager is disabled.
+The device buffers are allocated using native functions as needed and freed when going out of scope.
+
+When the environment variable is not set, it is treated as zero.
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-AF_PRINT_ERRORS=1 ./myprogram_opencl
+AF_MEM_DEBUG=1 ./myprogram
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+AF_MAX_BUFFERS {#af_max_buffers}
+-------------------------------------------------------------------------
+
+When AF_MAX_BUFFERS is set, this environment variable specifies the maximum number of buffers allocated before garbage collection kicks in.
+
+Please note that the total number of buffers that can exist simultaneously can be higher than this number. This variable tells the garbage collector that it should free any available buffers immediately if the threshold is reached.
+
+When not set, the default value is 1000.
+
+AF_OPENCL_MAX_JIT_LEN {#af_opencl_max_jit_len}
+-------------------------------------------------------------------------------
+
+When set, this environment variable specifies the maximum length of the OpenCL JIT tree after which evaluation is forced. The default value for this is 16 for AMD devices and 20 otherwise.
+
+AF_CUDA_MAX_JIT_LEN {#af_cuda_max_jit_len}
+-------------------------------------------------------------------------------
+
+When set, this environment variable specifies the maximum length of the CUDA JIT tree after which evaluation is forced. The default value for this is 20.
+
+AF_CPU_MAX_JIT_LEN {#af_cpu_max_jit_len}
+-------------------------------------------------------------------------------
+
+When set, this environment variable specifies the maximum length of the CPU JIT tree after which evaluation is forced. The default value for this is 20.
diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md
index 4f13cc7434..738d2b0a4f 100644
--- a/docs/pages/release_notes.md
+++ b/docs/pages/release_notes.md
@@ -1,6 +1,128 @@
 Release Notes {#releasenotes}
 ==============
 
+v3.3.0
+==============
+
+Major Updates
+-------------
+
+* CPU backend supports asynchronous execution.
+* Performance improvements to OpenCL BLAS and FFT functions.
+* Improved performance of memory manager.
+* Improvements to visualization functions.
+* Improved sorted order for OpenCL devices.
+* Integration with external OpenCL projects.
+
+Features
+----------
+
+* \ref af::getActiveBackend(): Returns the current backend being used.
+* [Scatter plot](https://github.com/arrayfire/arrayfire/pull/1116) added to graphics.
+* \ref af::transform() now supports perspective transformation matrices.
+* \ref af::infoString(): Returns `af::info()` as a string.
+* \ref af::printMemInfo(): Print a table showing information about buffer from the memory manager
+    * The \ref AF_MEM_INFO macro prints numbers and total sizes of all buffers (requires including af/macros.h)
+* \ref af::allocHost(): Allocates memory on host.
+* \ref af::freeHost(): Frees host side memory allocated by arrayfire.
+* OpenCL functions can now use CPU implementation.
+    * Currently limited to Unified Memory devices (CPU and On-board Graphics).
+    * Functions: af::matmul() and all [LAPACK](\ref linalg_mat) functions.
+    * Takes advantage of optimized libraries such as MKL without doing memory copies.
+    * Use the environment variable `AF_OPENCL_CPU_OFFLOAD=1` to take advantage of this feature.
+* Functions specific to OpenCL backend.
+    * \ref afcl::addDevice(): Adds an external device and context to ArrayFire's device manager.
+    * \ref afcl::deleteDevice(): Removes an external device and context from ArrayFire's device manager.
+    * \ref afcl::setDevice(): Sets an external device and context from ArrayFire's device manager.
+    * \ref afcl::getDeviceType(): Gets the device type of the current device.
+    * \ref afcl::getPlatform(): Gets the platform of the current device.
+* \ref af::createStridedArray() allows [array creation with user-defined strides](https://github.com/arrayfire/arrayfire/issues/1177) and device pointer.
+* [Expose functions](https://github.com/arrayfire/arrayfire/issues/1131) that provide information
+  about memory layout of Arrays.
+    * \ref af::getStrides(): Gets the strides for each dimension of the array.
+    * \ref af::getOffset(): Gets the offsets for each dimension of the array.
+    * \ref af::getRawPtr(): Gets raw pointer to the location of the array on device.
+    * \ref af::isLinear(): Returns true if all elements in the array are contiguous.
+    * \ref af::isOwner(): Returns true if the array owns the raw pointer, false if it is a sub-array.
+    * \ref af::getStrides(): Gets the strides of the array.
+    * \ref af::getStrides(): Gets the strides of the array.
+* \ref af::getDeviceId(): Gets the device id on which the array resides.
+* \ref af::isImageIOAvailable(): Returns true if ArrayFire was compiled with FreeImage enabled
+* \ref af::isLAPACKAvailable(): Returns true if ArrayFire was compiled with LAPACK functions enabled
+
+Bug Fixes
+--------------
+
+* Fixed [errors when using 3D / 4D arrays](https://github.com/arrayfire/arrayfire/pull/1251) in select and replace
+* Fixed [JIT errors on AMD devices](https://github.com/arrayfire/arrayfire/pull/1238) for OpenCL backend.
+* Fixed [imageio bugs](https://github.com/arrayfire/arrayfire/pull/1229) for 16 bit images.
+* Fixed [bugs when loading and storing images](https://github.com/arrayfire/arrayfire/pull/1228) natively.
+* Fixed [bug in FFT for NVIDIA GPUs](https://github.com/arrayfire/arrayfire/issues/615) when using OpenCL backend.
+* Fixed [bug when using external context](https://github.com/arrayfire/arrayfire/pull/1241) with OpenCL backend.
+* Fixed [memory leak](https://github.com/arrayfire/arrayfire/issues/1269) in \ref af_median_all().
+* Fixed [memory leaks and performance](https://github.com/arrayfire/arrayfire/pull/1274) in graphics functions.
+* Fixed [bugs when indexing followed by moddims](https://github.com/arrayfire/arrayfire/issues/1275).
+* \ref af_get_revision() now returns actual commit rather than AF_REVISION.
+* Fixed [releasing arrays](https://github.com/arrayfire/arrayfire/issues/1282) when using different backends.
+* OS X OpenCL: [LAPACK functions](\ref linalg_mat) on CPU devices use OpenCL offload (previously threw errors).
+* [Add support for 32-bit integer image types](https://github.com/arrayfire/arrayfire/pull/1287) in Image IO.
+* Fixed [set operations for row vectors](https://github.com/arrayfire/arrayfire/issues/1300)
+* Fixed [bugs](https://github.com/arrayfire/arrayfire/issues/1243) in \ref af::meanShift() and af::orb().
+
+Improvements
+--------------
+
+* Optionally [offload BLAS and LAPACK](https://github.com/arrayfire/arrayfire/pull/1221) functions to CPU implementations to improve performance.
+* Performance improvements to the memory manager.
+* Error messages are now more detailed.
+* Improved sorted order for OpenCL devices.
+* JIT heuristics can now be tweaked using environment variables. See
+  [Environment Variables](\ref configuring_environment) tutorial.
+* Add `BUILD_<BACKEND>` [options to examples and tests](https://github.com/arrayfire/arrayfire/issues/1286)
+  to toggle backends when compiling independently.
+
+Examples
+----------
+
+* New visualization [example simulating gravity](\ref graphics/gravity_sim.cpp).
+
+Build
+----------
+
+* Support for Intel `icc` compiler
+* Support to compile with Intel MKL as a BLAS and LAPACK provider
+* Tests are now available for building as standalone (like examples)
+* Tests can now be built as a single file for each backend
+* Better handling of NONFREE build options
+* [Searching for GLEW in CMake default paths](https://github.com/arrayfire/arrayfire/pull/1292)
+* Fixes for compiling with MKL on OSX.
+
+Installers
+----------
+* Improvements to OSX Installer
+    * CMake config files are now installed with libraries
+    * Independent options for installing examples and documentation components
+
+Deprecations
+-----------
+
+* `af_lock_device_arr` is now deprecated to be removed in v4.0.0. Use \ref af_lock_array() instead.
+* `af_unlock_device_arr` is now deprecated to be removed in v4.0.0. Use \ref af_unlock_array() instead.
+
+Documentation
+--------------
+
+* Fixes to documentation for \ref matchTemplate().
+* Improved documentation for deviceInfo.
+* Fixes to documentation for \ref exp().
+
+Known Issues
+------------
+
+* [Solve OpenCL fails on NVIDIA Maxwell devices](https://github.com/arrayfire/arrayfire/issues/1246)
+  for f32 and c32 when M > N and K % 4 is 1 or 2.
+
+
 v3.2.2
 ==============
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4710d1b739..be0f6407be 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -61,13 +61,17 @@ ENDMACRO()
 # and TARGET_LINK_LIBRARIES(... ${ARRAYFIRE_LIBRARIES}) are needed
 MACRO(BUILD_ALL FILES BACKEND_NAME BACKEND_LIBRARIES OTHER_LIBRARIES)
 
-    FOREACH(FILE ${FILES})
-        GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE)
-        GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH)
-        GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME)
+    STRING(TOUPPER "${BACKEND_NAME}" BACKEND_NAME_UPPER)  # e.g. cpu -> CPU, which selects the BUILD_CPU switch
+    MESSAGE(STATUS "EXAMPLES: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.")
+    IF(BUILD_${BACKEND_NAME_UPPER})  # IF() dereferences the option name itself; an unset or OFF option skips this backend
+        FOREACH(FILE ${FILES})
+            GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE)  # file name without directory or extension
+            GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH)
+            GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME)  # last component of the example's directory
 
-        BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME})
-    ENDFOREACH()
+            BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME})
+        ENDFOREACH()
+    ENDIF()
 ENDMACRO()
 
 # Collect the source
@@ -76,10 +80,9 @@ ADD_DEFINITIONS("-DASSETS_DIR=\"${ASSETS_DIR}\"")
 
 # Next we build each example using every backend.
 IF(${ArrayFire_CPU_FOUND})  # variable defined by FIND(ArrayFire ...)
-    MESSAGE(STATUS "EXAMPLES: CPU backend is ON.")
+    OPTION(BUILD_CPU "Build ArrayFire Examples for CPU backend" ON)
     BUILD_ALL("${FILES}" cpu ${ArrayFire_CPU_LIBRARIES} "")
 ELSEIF(TARGET afcpu)        # variable defined by the ArrayFire build tree
-    MESSAGE(STATUS "EXAMPLES: CPU backend is ON.")
     BUILD_ALL("${FILES}" cpu afcpu "")
 ELSE()
     MESSAGE(STATUS "EXAMPLES: CPU backend is OFF. afcpu was not found.")
@@ -87,10 +90,9 @@ ENDIF()
 
 # Next we build each example using every backend.
 IF(${ArrayFire_Unified_FOUND})  # variable defined by FIND(ArrayFire ...)
-    MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.")
+    OPTION(BUILD_UNIFIED "Build ArrayFire Examples for Unified backend" ON)
     BUILD_ALL("${FILES}" unified ${ArrayFire_Unified_LIBRARIES} "${CMAKE_DL_LIBS}")
 ELSEIF(TARGET af)        # variable defined by the ArrayFire build tree
-    MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.")
     BUILD_ALL("${FILES}" unified af "${CMAKE_DL_LIBS}")
 ELSE()
     MESSAGE(STATUS "EXAMPLES: UNIFIED backend is OFF. af was not found.")
@@ -104,10 +106,10 @@ IF (${CUDA_FOUND})
           PATHS ${CUDA_TOOLKIT_ROOT_DIR}
           DOC "CUDA NVVM Library"
           )
-        MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.")
+        MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY)
+        OPTION(BUILD_CUDA "Build ArrayFire Examples for CUDA backend" ON)
         BUILD_ALL("${FILES}" cuda ${ArrayFire_CUDA_LIBRARIES} "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
     ELSEIF(TARGET afcuda)        # variable defined by the ArrayFire build tree
-        MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.")
         BUILD_ALL("${FILES}" cuda afcuda "")
     ELSE()
         MESSAGE(STATUS "EXAMPLES: CUDA backend is OFF. afcuda was not found")
@@ -118,10 +120,9 @@ ENDIF()
 
 IF (${OpenCL_FOUND})
     IF(${ArrayFire_OpenCL_FOUND})  # variable defined by FIND(ArrayFire ...)
-        MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.")
+        OPTION(BUILD_OPENCL "Build ArrayFire Examples for OpenCL backend" ON)
         BUILD_ALL("${FILES}" opencl ${ArrayFire_OpenCL_LIBRARIES} "${OpenCL_LIBRARIES}")
     ELSEIF(TARGET afopencl)        # variable defined by the ArrayFire build tree
-        MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.")
         BUILD_ALL("${FILES}" opencl afopencl "${OpenCL_LIBRARIES}")
     ELSE()
         MESSAGE(STATUS "EXAMPLES: OpenCL backend is OFF. afopencl was not found")
diff --git a/examples/graphics/fractal.cpp b/examples/graphics/fractal.cpp
index 9ac5a86ea9..9781b61c90 100644
--- a/examples/graphics/fractal.cpp
+++ b/examples/graphics/fractal.cpp
@@ -10,13 +10,14 @@
 #include <stdio.h>
 #include <iostream>
 #include <arrayfire.h>
-#include <math.h>
+#include <cmath>
 #include <cstdlib>
 
 #define WIDTH 400 // Width of image
 #define HEIGHT 400 // Width of image
 
 using namespace af;
+using std::abs;
 
 array complex_grid(int width, int height, float zoom, float center[2])
 {
diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp
new file mode 100644
index 0000000000..3fc19d8c65
--- /dev/null
+++ b/examples/graphics/gravity_sim.cpp
@@ -0,0 +1,140 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <arrayfire.h>
+#include <iostream>
+#include <cstdio>
+
+using namespace af;
+using namespace std;
+
+static const int width = 512, height = 512;
+static const int pixels_per_unit = 20;
+
+void simulate(af::array *pos, af::array *vels, af::array *forces, float dt){ // advance all particles by one time step dt; pos/vels/forces are 2-element {x, y} arrays updated in place
+    pos[0] += vels[0] * pixels_per_unit * dt;
+    pos[1] += vels[1] * pixels_per_unit * dt;
+
+    //offset of every particle from the window center and its Euclidean length
+    af::array diff_x = pos[0] - width/2;
+    af::array diff_y = pos[1] - height/2;
+    af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y );
+
+    //unit vectors pointing from each particle towards the center (the previous contents of forces are overwritten)
+    forces[0] = -1 * diff_x / dist;
+    forces[1] = -1 * diff_y / dist;
+    //scale the unit force by the time step and the pixels_per_unit constant
+    forces[0] *= pixels_per_unit * dt;
+    forces[1] *= pixels_per_unit * dt;
+
+    //dampening: shrink velocities by a factor of (1 - 0.005*dt) every step
+    vels[0] *= 1 - (0.005*dt);
+    vels[1] *= 1 - (0.005*dt);
+
+    //update velocities from forces
+    vels[0] += forces[0];
+    vels[1] += forces[1];
+
+}
+
+void collisions(af::array *pos, af::array *vels){ // bounce particles off the sphere at the window center; pos/vels are 2-element {x, y} arrays updated in place
+    //clamp coordinates to valid pixel indices [0, width-1] x [0, height-1] (local copies only, pos itself is not clamped)
+    af::array projected_px = min(width - 1, max(0, pos[0]));
+    af::array projected_py = min(height - 1, max(0, pos[1]));
+
+    //offset of the clamped positions from the window center and its Euclidean length
+    af::array diff_x = projected_px - width/2;
+    af::array diff_y = projected_py - height/2;
+    af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y );
+
+    //collide with center sphere
+    const int radius = 50;
+    const float elastic_constant = 0.91f; // share of the speed kept after a bounce
+    if(sum<int>(dist<radius) > 0) { // only do the masked updates when at least one particle is inside the sphere
+        vels[0](dist<radius) = -elastic_constant * vels[0](dist<radius);
+        vels[1](dist<radius) = -elastic_constant * vels[1](dist<radius);
+
+        //normalize diff vector
+        diff_x /= dist;
+        diff_y /= dist;
+        //place all particles colliding with the sphere on its surface
+        pos[0](dist<radius) = width/2 + diff_x(dist<radius) * radius;
+        pos[1](dist<radius) = height/2 +  diff_y(dist<radius) * radius;
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    try {
+        const static int total_particles = 1000; // number of simulated particles
+        static const int reset = 500;            // the particle state is re-randomized every `reset` frames
+
+        af::info();
+
+        af::Window myWindow(width, height, "Gravity Simulation using ArrayFire");
+
+        int frame_count = 0;
+
+        // Build the 3x3 Gaussian kernel used to draw the particles just once
+        const af::array draw_kernel = gaussianKernel(3, 3);
+
+        af::array pos[2];    // {x, y} positions in pixels
+        af::array vels[2];   // {x, y} velocities
+        af::array forces[2]; // {x, y} forces, recomputed on every simulate() call
+
+        // Generate a random starting state: positions spread over the window, random velocities and forces
+        pos[0] = af::randu(total_particles) * width;
+        pos[1] = af::randu(total_particles) * height;
+
+        vels[0] = af::randn(total_particles);
+        vels[1] = af::randn(total_particles);
+
+        forces[0] = af::randn(total_particles);
+        forces[1] = af::randn(total_particles);
+
+        af::array image = af::constant(0, width, height);
+        af::array ids(total_particles, u32);
+
+        af::timer timer = af::timer::start();
+        while(!myWindow.close()) {
+            float dt = af::timer::stop(timer); // time elapsed since the previous frame
+            timer = af::timer::start();
+
+            ids = (pos[0].as(u32) * height) + pos[1].as(u32); // linear pixel index of every particle
+            image(ids) += 255;                                // light up the pixel under each particle
+            image = convolve2(image, draw_kernel);            // convolve the lit pixels with the 3x3 kernel
+            myWindow.image(image);
+            image = af::constant(0, image.dims());            // clear the canvas for the next frame
+            frame_count++;
+
+            // Every `reset` frames, restart from fresh random positions and velocities
+            if(frame_count % reset == 0) {
+                pos[0] = af::randu(total_particles) * width;
+                pos[1] = af::randu(total_particles) * height;
+
+                vels[0] = af::randn(total_particles);
+                vels[1] = af::randn(total_particles);
+            }
+
+            //check for collisions and adjust positions/velocities accordingly
+            collisions(pos, vels);
+
+            //run force simulation and update particles
+            simulate(pos, vels, forces, dt);
+
+        }
+    } catch (af::exception& e) {
+        fprintf(stderr, "%s\n", e.what()); // report the ArrayFire error message
+        throw;                             // then rethrow the same exception
+    }
+
+    return 0;
+}
+
diff --git a/examples/graphics/plot2d.cpp b/examples/graphics/plot2d.cpp
index 4f68b92b30..9b311f55f9 100644
--- a/examples/graphics/plot2d.cpp
+++ b/examples/graphics/plot2d.cpp
@@ -13,7 +13,7 @@
 
 using namespace af;
 
-static const int ITERATIONS = 100;
+static const int ITERATIONS = 50;
 static const float PRECISION = 1.0f/ITERATIONS;
 
 int main(int argc, char *argv[])
@@ -21,17 +21,22 @@ int main(int argc, char *argv[])
     try {
         // Initialize the kernel array just once
         af::info();
-        af::Window myWindow(512, 512, "2D Plot example: ArrayFire");
+        af::Window myWindow(1024, 512, "2D Plot example: ArrayFire");
 
         array Y;
         int sign = 1;
         array X = seq(-af::Pi, af::Pi, PRECISION);
+        array noise = randn(X.dims(0))/5.f;
 
+        myWindow.grid(1, 2);
         for (double val=-af::Pi; !myWindow.close(); ) {
 
             Y = sin(X);
 
-            myWindow.plot(X, Y);
+            myWindow(0,0).plot(X, Y);
+            myWindow(0,1).scatter(X, Y + noise, AF_MARKER_POINT);
+
+            myWindow.show();
 
             X = X + PRECISION * float(sign);
             val += PRECISION * float(sign);
diff --git a/examples/image_processing/adaptive_thresholding.cpp b/examples/image_processing/adaptive_thresholding.cpp
index 5ce34e76be..1004285148 100644
--- a/examples/image_processing/adaptive_thresholding.cpp
+++ b/examples/image_processing/adaptive_thresholding.cpp
@@ -13,6 +13,7 @@
 #include <arrayfire.h>
 
 using namespace af;
+using std::abs;
 
 typedef enum {
     MEAN = 0,
diff --git a/examples/image_processing/brain_segmentation.cpp b/examples/image_processing/brain_segmentation.cpp
index 7349bf258b..253d37e5f1 100644
--- a/examples/image_processing/brain_segmentation.cpp
+++ b/examples/image_processing/brain_segmentation.cpp
@@ -23,10 +23,12 @@ const float h_sy_kernel[] = { -1, 0, 1,
     -2, 0, 2,
     -1, 0, 1
 };
-const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f,
-    -1.0f,  6.0f, -1.0f,
-    -0.5f, -1.0f, -0.5f
-};
+
+// Unused
+//const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f,
+//    -1.0f,  6.0f, -1.0f,
+//    -0.5f, -1.0f, -0.5f
+//};
 
 array edges_slice(array x)
 {
diff --git a/examples/image_processing/filters.cpp b/examples/image_processing/filters.cpp
index 8b75acf063..ae1d7c155c 100644
--- a/examples/image_processing/filters.cpp
+++ b/examples/image_processing/filters.cpp
@@ -151,7 +151,7 @@ array medianfilter(const array &in, int window_width, int window_height)
     return ret_val;
 }
 
-array gaussianblur(const array &in, int window_width, int window_height, int sigma)
+array gaussianblur(const array &in, int window_width, int window_height, double sigma)
 {
     array g = gaussianKernel(window_width, window_height, sigma, sigma);
     return convolve(in, g);
diff --git a/include/af/array.h b/include/af/array.h
index 03f3eeb23a..de746d9384 100644
--- a/include/af/array.h
+++ b/include/af/array.h
@@ -672,6 +672,8 @@ namespace af
            Get the device pointer from the array and lock the buffer in memory manager.
            @{
 
+           The device memory returned by this function is not freed until unlock() is called.
+
            \ingroup arrayfire_func
            \ingroup device_mat
         */
@@ -961,7 +963,7 @@ namespace af
         /// \brief Locks the device buffer in the memory manager.
         ///
         /// This method can be called to take control of the device pointer from the memory manager.
-        /// While a buffer is locked, the memory manager does not free the memory.
+        /// While a buffer is locked, the memory manager doesn't free the memory until unlock() is invoked.
         void lock() const;
 
         ///
diff --git a/include/af/backend.h b/include/af/backend.h
index 93d8d8de58..94c4951d45 100644
--- a/include/af/backend.h
+++ b/include/af/backend.h
@@ -55,6 +55,29 @@ AFAPI af_err af_get_available_backends(int* backends);
 AFAPI af_err af_get_backend_id(af_backend *backend, const af_array in);
 #endif
 
+#if AF_API_VERSION >= 33
+/**
+   \param[out] backend takes one of the values of enum \ref af_backend
+   from the backend that is currently set to active
+   \returns \ref af_err error code
+
+   \ingroup unified_func_getactivebackend
+ */
+AFAPI af_err af_get_active_backend(af_backend *backend);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   \param[out] device contains the device on which \p in was created.
+   \param[in] in is the array whose device is to be queried.
+   \returns \ref af_err error code
+
+   \ingroup unified_func_getdeviceid
+ */
+AFAPI af_err af_get_device_id(int *device, const af_array in);
+#endif
+
+
 #ifdef __cplusplus
 }
 #endif
@@ -101,5 +124,26 @@ AFAPI int getAvailableBackends();
 AFAPI af::Backend getBackendId(const array &in);
 #endif
 
+#if AF_API_VERSION >= 33
+/**
+   \returns \ref af_backend which is the backend that is currently active
+
+   \ingroup unified_func_getactivebackend
+ */
+AFAPI af::Backend getActiveBackend();
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   \param[in] in is the array whose device is to be queried.
+   \returns The id of the device on which this array was created.
+
+   \note Device ID can be the same for arrays belonging to different backends.
+
+   \ingroup unified_func_getdeviceid
+ */
+AFAPI int getDeviceId(const array &in);
+#endif
+
 }
 #endif
diff --git a/include/af/defines.h b/include/af/defines.h
index a25d23996d..77508f2870 100644
--- a/include/af/defines.h
+++ b/include/af/defines.h
@@ -120,6 +120,13 @@ typedef enum {
     AF_ERR_BATCH          = 207,
 
 
+#if AF_API_VERSION >= 33
+    ///
+    /// Input does not belong to the current device.
+    ///
+    AF_ERR_DEVICE         = 208,
+#endif
+
     // 300-399 Errors for missing software features
 
     ///
@@ -378,6 +385,19 @@ typedef enum {
     AF_ID = 0
 } af_someenum_t;
 
+#if AF_API_VERSION >=32
+typedef enum {
+    AF_MARKER_NONE         = 0,
+    AF_MARKER_POINT        = 1,
+    AF_MARKER_CIRCLE       = 2,
+    AF_MARKER_SQUARE       = 3,
+    AF_MARKER_TRIANGLE     = 4,
+    AF_MARKER_CROSS        = 5,
+    AF_MARKER_PLUS         = 6,
+    AF_MARKER_STAR         = 7
+} af_marker_type;
+#endif
+
 #ifdef __cplusplus
 namespace af
 {
@@ -404,6 +424,9 @@ namespace af
 #if AF_API_VERSION >= 32
     typedef af_backend Backend;
 #endif
+#if AF_API_VERSION >= 32
+    typedef af_marker_type markerType;
+#endif
 }
 
 #endif
diff --git a/include/af/device.h b/include/af/device.h
index 826863e6d8..b08bd519b3 100644
--- a/include/af/device.h
+++ b/include/af/device.h
@@ -29,20 +29,33 @@ namespace af
     */
 
     /**
-       \defgroup device_func_prop deviceInfo
+       \defgroup device_func_info_string infoString
 
-       Get device information
+       Get af::info() as a string
 
        @{
 
+       \brief Returns the output of af::info() as a string
+
+       \param[in] verbose flag to return verbose info
+
+       \returns string containing output of af::info()
+
        \ingroup arrayfire_func
        \ingroup device_mat
     */
-    AFAPI void deviceInfo(char* d_name, char* d_platform, char *d_toolkit, char* d_compute);
+    AFAPI const char* infoString(const bool verbose = false);
     /**
        @}
     */
 
+    /**
+        \copydoc device_func_prop
+
+        \ingroup device_func_prop
+    */
+    AFAPI void deviceInfo(char* d_name, char* d_platform, char *d_toolkit, char* d_compute);
+
     /// \brief Gets the number of devices
     ///
     /// \copydoc device_func_count
@@ -87,6 +100,8 @@ namespace af
     /// \param[in] type is the type of the elements to allocate
     /// \returns the pointer to the memory
     ///
+    /// \note The device memory returned by this function is only freed if af::free() is called explicitly
+
     AFAPI void *alloc(const size_t elements, const dtype type);
 
     /// \brief Allocates memory using ArrayFire's memory manager
@@ -97,10 +112,20 @@ namespace af
     ///
     /// \note the size of the memory allocated is the number of \p elements *
     ///         sizeof(type)
+    ///
+    /// \note The device memory returned by this function is only freed if af::free() is called explicitly
     template<typename T>
     T* alloc(const size_t elements);
     /// @}
 
+    /// \ingroup device_func_free
+    ///
+    /// \copydoc device_func_free
+    /// \param[in] ptr the memory to free
+    ///
+    /// This function will free a device pointer even if it has been previously locked.
+    AFAPI void free(const void *ptr);
+
     /// \ingroup device_func_pinned
     /// @{
     ///
@@ -119,15 +144,51 @@ namespace af
     T* pinned(const size_t elements);
     /// @}
 
-    /// \ingroup device_func_free
-    /// @{
-    /// \copydoc device_func_free
+    /// \ingroup device_func_free_pinned
+    ///
+    /// \copydoc device_func_free_pinned
     /// \param[in] ptr the memory to free
-    AFAPI void free(const void *ptr);
-
-    /// \copydoc free()
     AFAPI void freePinned(const void *ptr);
-    ///@}
+
+#if AF_API_VERSION >= 33
+    /// \brief Allocate memory on host
+    ///
+    /// \copydoc device_func_alloc_host
+    ///
+    /// \param[in] elements the number of elements to allocate
+    /// \param[in] type is the type of the elements to allocate
+    /// \returns the pointer to the memory
+    ///
+    /// \ingroup device_func_alloc_host
+    AFAPI void *allocHost(const size_t elements, const dtype type);
+#endif
+
+#if AF_API_VERSION >= 33
+    /// \brief Allocate memory on host
+    ///
+    /// \copydoc device_func_alloc_host
+    ///
+    /// \param[in] elements the number of elements to allocate
+    /// \returns the pointer to the memory
+    ///
+    /// \note the size of the memory allocated is the number of \p elements *
+    ///         sizeof(type)
+    ///
+    /// \ingroup device_func_alloc_host
+    template<typename T>
+    AFAPI T* allocHost(const size_t elements);
+#endif
+
+#if AF_API_VERSION >= 33
+    /// \brief Free memory allocated internally by ArrayFire
+    ///
+    /// \copydoc device_func_free_host
+    ///
+    /// \param[in] ptr the memory to free
+    ///
+    /// \ingroup device_func_free_host
+    AFAPI void freeHost(const void *ptr);
+#endif
 
     /// \ingroup device_func_mem
     /// @{
@@ -139,9 +200,25 @@ namespace af
     //                              manager
     /// \param[out] lock_bytes The number of bytes in use
     /// \param[out] lock_buffers The number of buffers in use
+    ///
+    /// \note This function performs a synchronization operation
     AFAPI void deviceMemInfo(size_t *alloc_bytes, size_t *alloc_buffers,
                              size_t *lock_bytes, size_t *lock_buffers);
 
+#if AF_API_VERSION >= 33
+    ///
+    /// Prints buffer details from the ArrayFire Device Manager
+    ///
+    /// \param [in] msg A message to print before the table
+    /// \param [in] device_id print the memory info of the specified device.
+    ///  -1 signifies active device.
+    ///
+    /// \ingroup device_func_mem
+    ///
+    /// \note This function performs a synchronization operation
+    AFAPI void printMemInfo(const char *msg = NULL, const int device_id = -1);
+#endif
+
     /// \brief Call the garbage collection function in the memory manager
     ///
     /// \ingroup device_func_mem
@@ -169,10 +246,25 @@ extern "C" {
     */
     AFAPI af_err af_info();
 
+    /**
+       \ingroup device_func_info
+    */
     AFAPI af_err af_init();
 
     /**
-       \ingroup device_func_info
+       \brief Gets the output of af_info() as a string
+
+       \param[out] str contains the string
+       \param[in] verbose flag to return verbose info
+
+       \ingroup device_func_info_string
+    */
+    AFAPI af_err af_info_string(char** str, const bool verbose);
+
+    /**
+        \copydoc device_func_prop
+
+        \ingroup device_func_prop
     */
     AFAPI af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute);
 
@@ -203,24 +295,42 @@ extern "C" {
 
     /**
        \ingroup device_func_alloc
+
+       The device memory returned by this function can only be freed using af_free_device
     */
     AFAPI af_err af_alloc_device(void **ptr, const dim_t bytes);
 
     /**
-       \ingroup device_func_pinned
+       \ingroup device_func_free
+
+       This function will free a device pointer even if it has been previously locked.
     */
-    AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes);
+    AFAPI af_err af_free_device(void *ptr);
 
     /**
-       \ingroup device_func_free
+       \ingroup device_func_pinned
     */
-    AFAPI af_err af_free_device(void *ptr);
+    AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes);
 
     /**
        \ingroup device_func_free_pinned
     */
     AFAPI af_err af_free_pinned(void *ptr);
 
+#if AF_API_VERSION >= 33
+    /**
+       \ingroup device_func_alloc_host
+    */
+    AFAPI af_err af_alloc_host(void **ptr, const dim_t bytes);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \ingroup device_func_free_host
+    */
+    AFAPI af_err af_free_host(void *ptr);
+#endif
+
     /**
        Create array from device memory
        \ingroup construct_mat
@@ -234,6 +344,21 @@ extern "C" {
     AFAPI af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers,
                                     size_t *lock_bytes, size_t *lock_buffers);
 
+#if AF_API_VERSION >= 33
+    ///
+    /// Prints buffer details from the ArrayFire Device Manager
+    ///
+    /// \param [in] msg A message to print before the table
+    /// \param [in] device_id print the memory info of the specified device.
+    ///  -1 signifies active device.
+    ///
+    /// \returns \ref AF_SUCCESS if successful
+    ///
+    /// \ingroup device_func_mem
+    ///
+    AFAPI af_err af_print_mem_info(const char *msg, const int device_id);
+#endif
+
     /**
        Call the garbage collection routine
        \ingroup device_func_mem
@@ -256,9 +381,12 @@ extern "C" {
     /**
        Lock the device buffer in the memory manager.
 
-       Locked buffers are not freed by memory manager until \ref af_unlock_device_ptr is called.
+       Locked buffers are not freed by memory manager until \ref af_unlock_array is called.
        \ingroup device_func_mem
     */
+#if AF_API_VERSION >= 33
+    DEPRECATED("Use af_lock_array instead")
+#endif
     AFAPI af_err af_lock_device_ptr(const af_array arr);
 #endif
 
@@ -269,9 +397,32 @@ extern "C" {
        This function will give back the control over the device pointer to the memory manager.
        \ingroup device_func_mem
     */
+#if AF_API_VERSION >= 33
+    DEPRECATED("Use af_unlock_array instead")
+#endif
     AFAPI af_err af_unlock_device_ptr(const af_array arr);
 #endif
 
+#if AF_API_VERSION >= 33
+    /**
+       Lock the device buffer in the memory manager.
+
+       Locked buffers are not freed by memory manager until \ref af_unlock_array is called.
+       \ingroup device_func_mem
+    */
+    AFAPI af_err af_lock_array(const af_array arr);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       Unlock device buffer in the memory manager.
+
+       This function will give back the control over the device pointer to the memory manager.
+       \ingroup device_func_mem
+    */
+    AFAPI af_err af_unlock_array(const af_array arr);
+#endif
+
     /**
        Get the device pointer and lock the buffer in memory manager.
 
diff --git a/include/af/exception.h b/include/af/exception.h
index ee10c5db7b..a43d26dbaa 100644
--- a/include/af/exception.h
+++ b/include/af/exception.h
@@ -27,6 +27,9 @@ class AFAPI exception : public std::exception
     exception(const char *msg);
     exception(const char *file, unsigned line, af_err err);
     exception(const char *msg, const char *file, unsigned line, af_err err);
+#if AF_API_VERSION >= 33
+    exception(const char *msg, const char *func, const char *file, unsigned line, af_err err);
+#endif
     virtual ~exception() throw() {}
     virtual const char *what() const throw() { return m_msg; }
     friend inline std::ostream& operator<<(std::ostream &s, const exception &e)
diff --git a/include/af/graphics.h b/include/af/graphics.h
index 17cb622383..b69a83854a 100644
--- a/include/af/graphics.h
+++ b/include/af/graphics.h
@@ -30,6 +30,8 @@ namespace af
 
    \brief Window object to render af::arrays
 
+   Windows are not CopyConstructible or CopyAssignable.
+
    \ingroup graphics_func
  */
 class AFAPI Window {
@@ -43,6 +45,9 @@ class AFAPI Window {
 
         void initWindow(const int width, const int height, const char* const title);
 
+        Window(const Window&);                 // Prevent copy-construction
+        Window& operator=(const Window&);      // Prevent assignment
+
     public:
         /**
            Creates a window object with default width
@@ -84,6 +89,7 @@ class AFAPI Window {
            \ingroup gfx_func_window
          */
         Window(const af_window wnd);
+
         /**
            Destroys the window handle
 
@@ -177,9 +183,39 @@ class AFAPI Window {
 
            \ingroup gfx_func_draw
          */
-
         void plot(const array& X, const array& Y, const char* const title=NULL);
 
+#if AF_API_VERSION >= 33
+        /**
+           Renders the input arrays as a 2D scatter-plot to the window
+
+           \param[in] X is an \ref array with the x-axis data points
+           \param[in] Y is an \ref array with the y-axis data points
+           \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot
+           \param[in] title parameter is used when this function is called in grid mode
+
+           \note \p X and \p Y should be vectors.
+
+           \ingroup gfx_func_draw
+         */
+        void scatter(const array& X, const array& Y,
+                     const af::markerType marker = AF_MARKER_POINT, const char* const title = NULL);
+#endif
+
+#if AF_API_VERSION >= 33
+        /**
+           Renders the input arrays as a 3D scatter-plot to the window
+
+           \param[in] P is an \ref array or matrix with the xyz-values of the points
+           \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot
+           \param[in] title parameter is used when this function is called in grid mode
+
+           \ingroup gfx_func_draw
+         */
+        void scatter3(const array& P, const af::markerType marker = AF_MARKER_POINT,
+                      const char* const title = NULL);
+#endif
+
         /**
            Renders the input array as a histogram to the window
 
@@ -253,6 +289,17 @@ class AFAPI Window {
         */
         bool close();
 
+#if AF_API_VERSION >= 33
+        /**
+           Hide/Show the window
+
+           \param[in] isVisible indicates if the window is to be hidden or brought into focus
+
+           \ingroup gfx_func_window
+         */
+        void setVisibility(const bool isVisible);
+#endif
+
         /**
            This function is used to keep track of which cell in the grid mode is
            being currently rendered. When a user does Window(0,0), we internally
@@ -371,6 +418,47 @@ AFAPI af_err af_draw_image(const af_window wind, const af_array in, const af_cel
 */
 AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props);
 
+#if AF_API_VERSION >= 33
+/**
+   C Interface wrapper for drawing an array as a 2D scatter plot
+
+   \param[in]   wind is the window handle
+   \param[in]   X is an \ref af_array with the x-axis data points
+   \param[in]   Y is an \ref af_array with the y-axis data points
+   \param[in]   marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot
+   \param[in]   props is structure \ref af_cell that has the properties that are used
+   for the current rendering.
+
+   \return     \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code
+   is returned.
+
+   \note \p X and \p Y should be vectors.
+
+   \ingroup gfx_func_draw
+*/
+AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y,
+                             const af_marker_type marker, const af_cell* const props);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   C Interface wrapper for drawing an array as a 3D scatter plot
+
+   \param[in]   wind is the window handle
+   \param[in]   P is an \ref af_array or matrix with the xyz-values of the points
+   \param[in]   marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot
+   \param[in]   props is structure \ref af_cell that has the properties that are used
+   for the current rendering.
+
+   \return     \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code
+   is returned.
+
+   \ingroup gfx_func_draw
+*/
+AFAPI af_err af_draw_scatter3(const af_window wind, const af_array P,
+                              const af_marker_type marker, const af_cell* const props);
+#endif
+
 #if AF_API_VERSION >= 32
 /**
    C Interface wrapper for drawing an array as a plot
@@ -470,6 +558,18 @@ AFAPI af_err af_show(const af_window wind);
 */
 AFAPI af_err af_is_window_closed(bool *out, const af_window wind);
 
+#if AF_API_VERSION >= 33
+/**
+   Hide/Show a window
+
+   \param[in] wind is the window whose visibility is to be changed
+   \param[in] is_visible indicates if the window is to be hidden or brought into focus
+
+   \ingroup gfx_func_window
+ */
+AFAPI af_err af_set_visibility(const af_window wind, const bool is_visible);
+#endif
+
 /**
    C Interface wrapper for destroying a window handle
 
diff --git a/include/af/image.h b/include/af/image.h
index f38bb41694..0e0c0ba901 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -147,6 +147,16 @@ AFAPI array loadImageNative(const char* filename);
 AFAPI void saveImageNative(const char* filename, const array& in);
 #endif
 
+#if AF_API_VERSION >= 33
+/**
+    Function to check if Image IO is available
+
+    \returns true if ArrayFire was compiled with ImageIO support, false otherwise.
+    \ingroup imageio_func_available
+*/
+AFAPI bool isImageIOAvailable();
+#endif
+
 /**
     C++ Interface for resizing an image to specified dimensions
 
@@ -213,6 +223,20 @@ AFAPI array rotate(const array& in, const float theta, const bool crop=true, con
 */
 AFAPI array transform(const array& in, const array& transform, const dim_t odim0 = 0, const dim_t odim1 = 0, const interpType method=AF_INTERP_NEAREST, const bool inverse=true);
 
+#if AF_API_VERSION >= 33
+/**
+    C++ Interface for transforming coordinates
+
+    \param[in] tf is transformation matrix
+    \param[in] d0 is the first input dimension
+    \param[in] d1 is the second input dimension
+    \return the transformed coordinates
+
+    \ingroup transform_func_coordinates
+*/
+AFAPI array transformCoordinates(const array& tf, const float d0, const float d1);
+#endif
+
 /**
     C++ Interface for translating an image
 
@@ -794,6 +818,20 @@ extern "C" {
     AFAPI af_err af_save_image_native(const char* filename, const af_array in);
 #endif
 
+#if AF_API_VERSION >= 33
+    /**
+        Function to check if Image IO is available
+
+        \param[out] out is true if ArrayFire was compiled with ImageIO support,
+        false otherwise.
+
+        \return     \ref AF_SUCCESS if successful
+
+        \ingroup imageio_func_available
+    */
+    AFAPI af_err af_is_image_io_available(bool *out);
+#endif
+
     /**
        C Interface for resizing an image to specified dimensions
 
@@ -829,6 +867,21 @@ extern "C" {
                               const dim_t odim0, const dim_t odim1,
                               const af_interp_type method, const bool inverse);
 
+#if AF_API_VERSION >= 33
+    /**
+       C Interface for transforming coordinates
+       (C counterpart of the C++ transformCoordinates function)
+
+       \param[out] out the transformed coordinates
+       \param[in] tf is transformation matrix
+       \param[in] d0 is the first input dimension
+       \param[in] d1 is the second input dimension
+
+       \ingroup transform_func_coordinates
+    */
+    AFAPI af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1);
+#endif
+
     /**
        C Interface for rotating an image
 
diff --git a/include/af/internal.h b/include/af/internal.h
new file mode 100644
index 0000000000..53002929c3
--- /dev/null
+++ b/include/af/internal.h
@@ -0,0 +1,181 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <af/dim4.hpp>
+
+#ifdef __cplusplus
+namespace af
+{
+    class array;
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] data is the raw data pointer.
+       \param[in] offset specifies the number of elements to skip.
+       \param[in] dims specifies the dimensions for the region of interest.
+       \param[in] strides specifies the distance between each element of a given dimension.
+       \param[in] ty specifies the data type of \p data.
+       \param[in] location specifies if the data is on host or the device.
+
+       \note If \p location is `afHost`, a memory copy is performed.
+
+       \returns an af::array() with specified offset, dimensions and strides.
+
+       \ingroup internal_func_create
+    */
+    AFAPI array createStridedArray(const void *data, const dim_t offset,
+                                   const dim4 dims, const dim4 strides,
+                                   const af::dtype ty,
+                                   const af::source location);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] in A multi-dimensional array.
+       \returns af::dim4() containing distance between consecutive elements in each dimension.
+
+       \ingroup internal_func_strides
+    */
+    AFAPI dim4 getStrides(const array &in);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] in A multi-dimensional array.
+       \returns offset from the starting location of data pointer specified in number of elements.
+
+       \ingroup internal_func_offset
+    */
+    AFAPI dim_t getOffset(const array &in);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] in A multi-dimensional array.
+       \returns the raw pointer location to the array.
+
+       \note This pointer may be shared with other arrays. Use this function with caution.
+
+       \ingroup internal_func_rawptr
+    */
+    AFAPI void *getRawPtr(const array &in);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] in A multi-dimensional array.
+       \returns a boolean specifying if all elements in the array are contiguous.
+
+       \ingroup internal_func_linear
+    */
+    AFAPI bool isLinear(const array &in);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] in A multi-dimensional array.
+       \returns a boolean specifying if the array owns the raw pointer. It is false if it is a sub array.
+
+       \ingroup internal_func_owner
+    */
+    AFAPI bool isOwner(const array &in);
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[out] arr an af_array with specified offset, dimensions and strides.
+       \param[in] data is the raw data pointer.
+       \param[in] offset specifies the number of elements to skip.
+       \param[in] ndims specifies the number of array dimensions.
+       \param[in] dims specifies the dimensions for the region of interest.
+       \param[in] strides specifies the distance between each element of a given dimension.
+       \param[in] ty specifies the data type of \p data.
+       \param[in] location specifies if the data is on host or the device.
+
+       \note If \p location is `afHost`, a memory copy is performed.
+
+       \ingroup internal_func_create
+    */
+    AFAPI af_err af_create_strided_array(af_array *arr,
+                                         const void *data,
+                                         const dim_t offset,
+                                         const unsigned ndims,
+                                         const dim_t *const dims,
+                                         const dim_t *const strides,
+                                         const af_dtype ty,
+                                         const af_source location);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] arr A multi-dimensional array.
+       \param[out] s0 distance between each consecutive element along first  dimension.
+       \param[out] s1 distance between each consecutive element along second dimension.
+       \param[out] s2 distance between each consecutive element along third  dimension.
+       \param[out] s3 distance between each consecutive element along fourth dimension.
+
+       \ingroup internal_func_strides
+    */
+    AFAPI af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array arr);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] arr A multi-dimensional array.
+       \param[out] offset Offset from the starting location of data pointer specified in number of elements.
+
+       \ingroup internal_func_offset
+    */
+    AFAPI af_err af_get_offset(dim_t *offset, const af_array arr);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] arr A multi-dimensional array.
+       \param[out] ptr the raw pointer location to the array.
+
+       \note This pointer may be shared with other arrays. Use this function with caution.
+
+       \ingroup internal_func_rawptr
+    */
+    AFAPI af_err af_get_raw_ptr(void **ptr, const af_array arr);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] arr A multi-dimensional array.
+       \param[out] result a boolean specifying if all elements in the array are contiguous.
+
+       \ingroup internal_func_linear
+    */
+    AFAPI af_err af_is_linear(bool *result, const af_array arr);
+#endif
+
+#if AF_API_VERSION >= 33
+    /**
+       \param[in] arr A multi-dimensional array.
+       \param[out] result a boolean specifying if the array owns the raw pointer. It is false if it is a sub array.
+
+       \ingroup internal_func_owner
+    */
+    AFAPI af_err af_is_owner(bool *result, const af_array arr);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/af/lapack.h b/include/af/lapack.h
index f1cf87ad82..bb54069550 100644
--- a/include/af/lapack.h
+++ b/include/af/lapack.h
@@ -237,6 +237,18 @@ namespace af
     */
     AFAPI double norm(const array &in, const normType type=AF_NORM_EUCLID,
                       const double p=1, const double q=1);
+
+#if AF_API_VERSION >= 33
+    /**
+       Returns true if ArrayFire is compiled with LAPACK support
+
+       \returns true if LAPACK support is available, false otherwise
+
+       \ingroup lapack_ops_func_norm
+    */
+    AFAPI bool isLAPACKAvailable();
+#endif
+
 }
 #endif
 
@@ -425,6 +437,19 @@ extern "C" {
     */
     AFAPI af_err af_norm(double *out, const af_array in, const af_norm_type type, const double p, const double q);
 
+#if AF_API_VERSION >= 33
+    /**
+       Returns true if ArrayFire is compiled with LAPACK support
+
+       \param[out] out is true if LAPACK support is available, false otherwise
+
+       \returns AF_SUCCESS if successful (does not depend on the value of out)
+
+       \ingroup lapack_ops_func_norm
+    */
+    AFAPI af_err af_is_lapack_available(bool *out);
+#endif
+
 
 #ifdef __cplusplus
 }
diff --git a/include/af/opencl.h b/include/af/opencl.h
index 271879fdc9..16b85d763f 100644
--- a/include/af/opencl.h
+++ b/include/af/opencl.h
@@ -7,6 +7,7 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenCL/cl.h>
 #else
@@ -19,6 +20,29 @@
 extern "C" {
 #endif
 
+#if AF_API_VERSION >= 33
+typedef enum
+{
+    AFCL_DEVICE_TYPE_CPU     = CL_DEVICE_TYPE_CPU,
+    AFCL_DEVICE_TYPE_GPU     = CL_DEVICE_TYPE_GPU,
+    AFCL_DEVICE_TYPE_ACC     = CL_DEVICE_TYPE_ACCELERATOR,
+    AFCL_DEVICE_TYPE_UNKNOWN = -1
+} afcl_device_type;
+#endif
+
+#if AF_API_VERSION >= 33
+typedef enum
+{
+    AFCL_PLATFORM_AMD     = 0,
+    AFCL_PLATFORM_APPLE   = 1,
+    AFCL_PLATFORM_INTEL   = 2,
+    AFCL_PLATFORM_NVIDIA  = 3,
+    AFCL_PLATFORM_BEIGNET = 4,
+    AFCL_PLATFORM_POCL    = 5,
+    AFCL_PLATFORM_UNKNOWN = -1
+} afcl_platform;
+#endif
+
 /**
     \ingroup opencl_mat
     @{
@@ -63,6 +87,67 @@ AFAPI af_err afcl_get_device_id(cl_device_id *id);
 AFAPI af_err afcl_set_device_id(cl_device_id id);
 #endif
 
+#if AF_API_VERSION >= 33
+/**
+   Push user provided device control constructs into the ArrayFire device manager pool
+
+   This function should be used only when the user would like ArrayFire to use a
+   user-generated OpenCL context and related objects for ArrayFire operations.
+
+   \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire
+   \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire
+   \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this
+                  parameter is NULL, then we create a command queue for the user using the OpenCL
+                  context they provided us.
+
+   \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue)
+   that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please
+   be aware of the lifetime of the cl_* objects before passing them to ArrayFire.
+*/
+AFAPI af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Set active device using cl_context and cl_device_id
+
+   \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire
+   \param[in] ctx is the OpenCL cl_context being used by ArrayFire
+*/
+AFAPI af_err afcl_set_device_context(cl_device_id dev, cl_context ctx);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Remove the user provided device control constructs from the ArrayFire device manager pool
+
+   This function should be used only when the user would like ArrayFire to remove an already
+   pushed user generated OpenCL context and related objects.
+
+   \param[in] dev is the OpenCL device id that has to be popped
+   \param[in] ctx is the cl_context object to be removed from ArrayFire pool
+
+   \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented
+   by this func call and you won't be able to call `afcl_set_device_context` on these objects after
+   this function has been called.
+*/
+AFAPI af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Get the type of the current device
+*/
+AFAPI af_err afcl_get_device_type(afcl_device_type *res);
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Get the platform of the current device
+*/
+AFAPI af_err afcl_get_platform(afcl_platform *res);
+#endif
+
 /**
   @}
 */
@@ -147,6 +232,97 @@ namespace afcl
  }
 #endif
 
+#if AF_API_VERSION >= 33
+/**
+   Push user provided device control constructs into the ArrayFire device manager pool
+
+   This function should be used only when the user would like ArrayFire to use a
+   user-generated OpenCL context and related objects for ArrayFire operations.
+
+   \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire
+   \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire
+   \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this
+                  parameter is NULL, then we create a command queue for the user using the OpenCL
+                  context they provided us.
+
+   \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue)
+   that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please
+   be aware of the lifetime of the cl_* objects before passing them to ArrayFire.
+*/
+static inline void addDevice(cl_device_id dev, cl_context ctx, cl_command_queue que)
+{
+    af_err err = afcl_add_device_context(dev, ctx, que);
+    if (err!=AF_SUCCESS) throw af::exception("Failed to push user provided device/context to ArrayFire pool");
+}
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Set active device using cl_context and cl_device_id
+
+   \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire
+   \param[in] ctx is the OpenCL cl_context being used by ArrayFire
+*/
+static inline void setDevice(cl_device_id dev, cl_context ctx)
+{
+    af_err err = afcl_set_device_context(dev, ctx);
+    if (err!=AF_SUCCESS) throw af::exception("Failed to set device based on cl_device_id & cl_context");
+}
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Remove the user provided device control constructs from the ArrayFire device manager pool
+
+   This function should be used only when the user would like ArrayFire to remove an already
+   pushed user generated OpenCL context and related objects.
+
+   \param[in] dev is the OpenCL device id that has to be popped
+   \param[in] ctx is the cl_context object to be removed from ArrayFire pool
+
+   \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented
+   by this func call and you won't be able to call `afcl_set_device_context` on these objects after
+   this function has been called.
+*/
+static inline void deleteDevice(cl_device_id dev, cl_context ctx)
+{
+    af_err err = afcl_delete_device_context(dev, ctx);
+    if (err!=AF_SUCCESS) throw af::exception("Failed to remove the requested device from ArrayFire device pool");
+}
+#endif
+
+
+#if AF_API_VERSION >= 33
+ typedef afcl_device_type deviceType;
+ typedef afcl_platform platform;
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Get the type of the current device
+*/
+static inline deviceType getDeviceType()
+{
+    afcl_device_type res = AFCL_DEVICE_TYPE_UNKNOWN;
+    af_err err = afcl_get_device_type(&res);
+    if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL device type");
+    return res;
+}
+#endif
+
+#if AF_API_VERSION >= 33
+/**
+   Get the platform of the current device
+*/
+static inline platform getPlatform()
+{
+    afcl_platform res = AFCL_PLATFORM_UNKNOWN;
+    af_err err = afcl_get_platform(&res);
+    if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL platform");
+    return res;
+}
+#endif
+
  /**
  Create an af::array object from an OpenCL cl_mem buffer
 
@@ -263,15 +439,15 @@ namespace afcl
      return afcl::array(af::dim4(dim0, dim1, dim2, dim3), buf, type, retain);
  }
 
- /**
+/**
    @}
- */
-
+*/
 }
 
 namespace af
 {
 
+#if !defined(AF_OPENCL)
 template<> AFAPI cl_mem *array::device() const
 {
     cl_mem *mem = new cl_mem;
@@ -279,6 +455,7 @@ template<> AFAPI cl_mem *array::device() const
     if (err != AF_SUCCESS) throw af::exception("Failed to get cl_mem from array object");
     return mem;
 }
+#endif
 
 }
 
diff --git a/include/af/util.h b/include/af/util.h
index c1fd96ab24..eef46f47c9 100644
--- a/include/af/util.h
+++ b/include/af/util.h
@@ -95,7 +95,8 @@ namespace af
 #if AF_API_VERSION >= 31
     /**
         \param[out] output is the pointer to the c-string that will hold the data. The memory for
-        output is allocated by the function. The user is responsible for deleting the memory.
+        output is allocated by the function. The user is responsible for deleting the memory using
+        af::freeHost() or af_free_host().
         \param[in] exp is an expression, generally the name of the array
         \param[in] arr is the input array
         \param[in] precision is the precision length for display
@@ -108,6 +109,24 @@ namespace af
                         const int precision = 4, const bool transpose = true);
 #endif
 
+#if AF_API_VERSION >= 33
+    /**
+        \param[in] exp is an expression, generally the name of the array
+        \param[in] arr is the input array
+        \param[in] precision is the precision length for display
+        \param[in] transpose determines whether or not to transpose the array before storing it in
+        the string
+
+        \return output is the pointer to the c-string that will hold the data. The memory for
+        output is allocated by the function. The user is responsible for deleting the memory using
+        af::freeHost() or af_free_host().
+
+        \ingroup print_func_tostring
+    */
+    AFAPI const char* toString(const char *exp, const array &arr,
+                               const int precision = 4, const bool transpose = true);
+#endif
+
     // Purpose of Addition: "How to add Function" documentation
     AFAPI array exampleFunction(const array& in, const af_someenum_t param);
 }
@@ -229,10 +248,20 @@ extern "C" {
     AFAPI af_err af_example_function(af_array* out, const af_array in, const af_someenum_t param);
 
     ///
-    ///Get the version information of the library
+    /// Get the version information of the library
     ///
     AFAPI af_err af_get_version(int *major, int *minor, int *patch);
 
+
+#if AF_API_VERSION >= 33
+    ///
+    /// Get the revision (commit) information of the library.
+    /// This returns a constant string from compile time and should not be
+    /// freed by the user.
+    ///
+    AFAPI const char *af_get_revision();
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/arrayfire.h b/include/arrayfire.h
index 7d9e75a7b4..60df3176d1 100644
--- a/include/arrayfire.h
+++ b/include/arrayfire.h
@@ -113,6 +113,8 @@
 
      @defgroup lapack_ops_mat Matrix operations
      inverse, det, rank, norm etc.
+
+     @defgroup lapack_helper LAPACK Helper functions
    @}
 
    @defgroup image_mat Image Processing
@@ -207,6 +209,15 @@
 
    @}
 
+   @defgroup internal_func Functions to work with internal array layout
+   @{
+
+     Functions to work with arrayfire's internal data structure.
+
+     Note: The behavior of these functions is not promised to be consistent across versions.
+
+   @}
+
    @defgroup external Interface Functions
    @{
 
diff --git a/src/api/c/array.cpp b/src/api/c/array.cpp
new file mode 100644
index 0000000000..cefdde1d75
--- /dev/null
+++ b/src/api/c/array.cpp
@@ -0,0 +1,96 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#include <handle.hpp>
+#include <ArrayInfo.hpp>
+#include <platform.hpp>
+
+const ArrayInfo&
+getInfo(const af_array arr, bool check)
+{
+    const ArrayInfo *info = static_cast<ArrayInfo*>(reinterpret_cast<void *>(arr));
+
+    if (check && info->getDevId() != detail::getActiveDeviceId()) {
+        AF_ERROR("Input Array not created on current device", AF_ERR_DEVICE);
+    }
+
+    return *info;
+}
+
+af_err af_get_elements(dim_t *elems, const af_array arr)
+{
+    try {
+        // Do not check for device mismatch
+        *elems =  getInfo(arr, false).elements();
+    } CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_get_type(af_dtype *type, const af_array arr)
+{
+    try {
+        // Do not check for device mismatch
+        *type = getInfo(arr, false).getType();
+    } CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3,
+                   const af_array in)
+{
+    try {
+        // Do not check for device mismatch
+        ArrayInfo info = getInfo(in, false);
+        *d0 = info.dims()[0];
+        *d1 = info.dims()[1];
+        *d2 = info.dims()[2];
+        *d3 = info.dims()[3];
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_get_numdims(unsigned *nd, const af_array in)
+{
+    try {
+        // Do not check for device mismatch
+        ArrayInfo info = getInfo(in, false);
+        *nd = info.ndims();
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+
+#undef INSTANTIATE
+#define INSTANTIATE(fn1, fn2)                           \
+    af_err fn1(bool *result, const af_array in)         \
+    {                                                   \
+        try {                                           \
+            ArrayInfo info = getInfo(in, false);   \
+            *result = info.fn2();                       \
+        }                                               \
+        CATCHALL                                        \
+            return AF_SUCCESS;                          \
+    }
+
+INSTANTIATE(af_is_empty       , isEmpty       )
+INSTANTIATE(af_is_scalar      , isScalar      )
+INSTANTIATE(af_is_row         , isRow         )
+INSTANTIATE(af_is_column      , isColumn      )
+INSTANTIATE(af_is_vector      , isVector      )
+INSTANTIATE(af_is_complex     , isComplex     )
+INSTANTIATE(af_is_real        , isReal        )
+INSTANTIATE(af_is_double      , isDouble      )
+INSTANTIATE(af_is_single      , isSingle      )
+INSTANTIATE(af_is_realfloating, isRealFloating)
+INSTANTIATE(af_is_floating    , isFloating    )
+INSTANTIATE(af_is_integer     , isInteger     )
+INSTANTIATE(af_is_bool        , isBool        )
+
+#undef INSTANTIATE
diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp
index 13fa179da8..8ff37630e8 100644
--- a/src/api/c/assign.cpp
+++ b/src/api/c/assign.cpp
@@ -29,6 +29,7 @@ using std::swap;
 template<typename T>
 Array<T> modDims(const Array<T>& in, const af::dim4 &newDims);
 
+
 template<typename Tout, typename Tin>
 static
 void assign(Array<Tout> &out, const unsigned &ndims, const af_seq *index, const Array<Tin> &in_)
@@ -39,7 +40,7 @@ void assign(Array<Tout> &out, const unsigned &ndims, const af_seq *index, const
     DIM_ASSERT(0, (outDs.ndims()>=iDims.ndims()));
     DIM_ASSERT(0, (outDs.ndims()>=(dim_t)ndims));
 
-    evalArray(out);
+    out.eval();
 
     vector<af_seq> index_(index, index+ndims);
 
@@ -125,7 +126,7 @@ af_err af_assign_seq(af_array *out,
 
         ArrayInfo lInfo = getInfo(lhs);
 
-        if (ndims == 1 && ndims != (dim_t)lInfo.ndims()) {
+        if (ndims == 1 && ndims != lInfo.ndims()) {
             af_array tmp_in, tmp_out;
             AF_CHECK(af_flat(&tmp_in, lhs));
             AF_CHECK(af_assign_seq(&tmp_out, tmp_in, ndims, index, rhs));
@@ -350,10 +351,10 @@ af_err af_assign_gen(af_array *out,
             throw;
         }
         if (is_vector) { AF_CHECK(af_release_array(rhs)); }
+
+        std::swap(*out, output);
     }
     CATCHALL;
 
-    std::swap(*out, output);
-
     return AF_SUCCESS;
 }
diff --git a/src/api/c/data.cpp b/src/api/c/data.cpp
index 4d77fb279e..522eb7dfcb 100644
--- a/src/api/c/data.cpp
+++ b/src/api/c/data.cpp
@@ -539,62 +539,10 @@ af_err af_iota(af_array *result, const unsigned ndims, const dim_t * const dims,
     return AF_SUCCESS;
 }
 
-#undef INSTANTIATE
-#define INSTANTIATE(fn1, fn2)                   \
-    af_err fn1(bool *result, const af_array in) \
-    {                                           \
-        try {                                   \
-            ArrayInfo info = getInfo(in);       \
-            *result = info.fn2();               \
-        }                                       \
-        CATCHALL                                \
-            return AF_SUCCESS;                  \
-    }
-
-INSTANTIATE(af_is_empty       , isEmpty       )
-INSTANTIATE(af_is_scalar      , isScalar      )
-INSTANTIATE(af_is_row         , isRow         )
-INSTANTIATE(af_is_column      , isColumn      )
-INSTANTIATE(af_is_vector      , isVector      )
-INSTANTIATE(af_is_complex     , isComplex     )
-INSTANTIATE(af_is_real        , isReal        )
-INSTANTIATE(af_is_double      , isDouble      )
-INSTANTIATE(af_is_single      , isSingle      )
-INSTANTIATE(af_is_realfloating, isRealFloating)
-INSTANTIATE(af_is_floating    , isFloating    )
-INSTANTIATE(af_is_integer     , isInteger     )
-INSTANTIATE(af_is_bool        , isBool        )
-
-#undef INSTANTIATE
-
-af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3,
-                   const af_array in)
-{
-    try {
-        ArrayInfo info = getInfo(in);
-        *d0 = info.dims()[0];
-        *d1 = info.dims()[1];
-        *d2 = info.dims()[2];
-        *d3 = info.dims()[3];
-    }
-    CATCHALL
-    return AF_SUCCESS;
-}
-
-af_err af_get_numdims(unsigned *nd, const af_array in)
-{
-    try {
-        ArrayInfo info = getInfo(in);
-        *nd = info.ndims();
-    }
-    CATCHALL
-    return AF_SUCCESS;
-}
-
 template<typename T>
 static inline void eval(af_array arr)
 {
-    evalArray(getArray<T>(arr));
+    getArray<T>(arr).eval();
     return;
 }
 
diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp
index 28b4cc2c49..abe0b01e32 100644
--- a/src/api/c/device.cpp
+++ b/src/api/c/device.cpp
@@ -15,8 +15,8 @@
 #include <platform.hpp>
 #include <Array.hpp>
 #include <handle.hpp>
-#include <memory.hpp>
 #include "err_common.hpp"
+#include <cstring>
 
 using namespace detail;
 
@@ -38,7 +38,9 @@ af_err af_get_backend_count(unsigned* num_backends)
 
 af_err af_get_available_backends(int* result)
 {
-    *result = getBackend();
+    try {
+        *result = getBackend();
+    } CATCHALL;
     return AF_SUCCESS;
 }
 
@@ -46,18 +48,34 @@ af_err af_get_backend_id(af_backend *result, const af_array in)
 {
     try {
         ARG_ASSERT(1, in != 0);
-        ArrayInfo info = getInfo(in);
+        ArrayInfo info = getInfo(in, false);
         *result = info.getBackendId();
     } CATCHALL;
     return AF_SUCCESS;
 }
 
+af_err af_get_device_id(int *device, const af_array in)
+{
+    try {
+        ARG_ASSERT(1, in != 0);
+        ArrayInfo info = getInfo(in, false);
+        *device = info.getDevId();
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_get_active_backend(af_backend *result)
+{
+    *result = (af_backend)getBackend();
+    return AF_SUCCESS;
+}
+
 af_err af_init()
 {
     try {
         static bool first = true;
         if(first) {
-            getInfo();
+            getDeviceInfo();
             first = false;
         }
     } CATCHALL;
@@ -66,15 +84,23 @@ af_err af_init()
 
 af_err af_info()
 {
-    printf("%s", getInfo().c_str());
+    try {
+        printf("%s", getDeviceInfo().c_str());
+    } CATCHALL;
     return AF_SUCCESS;
 }
 
-af_err af_get_version(int *major, int *minor, int *patch)
+af_err af_info_string(char **str, const bool verbose)
 {
-    *major = AF_VERSION_MAJOR;
-    *minor = AF_VERSION_MINOR;
-    *patch = AF_VERSION_PATCH;
+    try {
+        std::string infoStr = getDeviceInfo();
+        AF_CHECK(af_alloc_host((void**)str, sizeof(char) * (infoStr.size() + 1)));
+
+        // Deep copy into the af_alloc_host buffer (AF_CHECK bails out if the allocation
+        // failed); infoStr.c_str() won't do, it dangles once the local infoStr is destroyed.
+        infoStr.copy(*str, infoStr.size());
+        (*str)[infoStr.size()] = '\0';
+    } CATCHALL;
 
     return AF_SUCCESS;
 }
@@ -130,198 +156,3 @@ af_err af_sync(const int device)
     } CATCHALL;
     return AF_SUCCESS;
 }
-
-af_err af_device_array(af_array *arr, const void *data,
-                       const unsigned ndims,
-                       const dim_t * const dims,
-                       const af_dtype type)
-{
-    try {
-        AF_CHECK(af_init());
-
-        af_array res;
-
-        DIM_ASSERT(1, ndims >= 1);
-        dim4 d(1, 1, 1, 1);
-        for(unsigned i = 0; i < ndims; i++) {
-            d[i] = dims[i];
-            DIM_ASSERT(3, dims[i] >= 1);
-        }
-
-        switch (type) {
-        case f32: res = getHandle(createDeviceDataArray<float  >(d, data)); break;
-        case f64: res = getHandle(createDeviceDataArray<double >(d, data)); break;
-        case c32: res = getHandle(createDeviceDataArray<cfloat >(d, data)); break;
-        case c64: res = getHandle(createDeviceDataArray<cdouble>(d, data)); break;
-        case s32: res = getHandle(createDeviceDataArray<int    >(d, data)); break;
-        case u32: res = getHandle(createDeviceDataArray<uint   >(d, data)); break;
-        case s64: res = getHandle(createDeviceDataArray<intl   >(d, data)); break;
-        case u64: res = getHandle(createDeviceDataArray<uintl  >(d, data)); break;
-        case s16: res = getHandle(createDeviceDataArray<short  >(d, data)); break;
-        case u16: res = getHandle(createDeviceDataArray<ushort >(d, data)); break;
-        case u8 : res = getHandle(createDeviceDataArray<uchar  >(d, data)); break;
-        case b8 : res = getHandle(createDeviceDataArray<char   >(d, data)); break;
-        default: TYPE_ERROR(4, type);
-        }
-
-        std::swap(*arr, res);
-    } CATCHALL;
-
-    return AF_SUCCESS;
-}
-
-af_err af_get_device_ptr(void **data, const af_array arr)
-{
-    try {
-        af_dtype type = getInfo(arr).getType();
-
-        switch (type) {
-            //FIXME: Perform copy if memory not continuous
-        case f32: *data = getDevicePtr(getArray<float  >(arr)); break;
-        case f64: *data = getDevicePtr(getArray<double >(arr)); break;
-        case c32: *data = getDevicePtr(getArray<cfloat >(arr)); break;
-        case c64: *data = getDevicePtr(getArray<cdouble>(arr)); break;
-        case s32: *data = getDevicePtr(getArray<int    >(arr)); break;
-        case u32: *data = getDevicePtr(getArray<uint   >(arr)); break;
-        case s64: *data = getDevicePtr(getArray<intl   >(arr)); break;
-        case u64: *data = getDevicePtr(getArray<uintl  >(arr)); break;
-        case s16: *data = getDevicePtr(getArray<short  >(arr)); break;
-        case u16: *data = getDevicePtr(getArray<ushort >(arr)); break;
-        case u8 : *data = getDevicePtr(getArray<uchar  >(arr)); break;
-        case b8 : *data = getDevicePtr(getArray<char   >(arr)); break;
-
-        default: TYPE_ERROR(4, type);
-        }
-
-    } CATCHALL;
-
-    return AF_SUCCESS;
-}
-
-template <typename T>
-inline void lockDevicePtr(const af_array arr)
-{
-    memPop<T>((const T *)getArray<T>(arr).get());
-}
-
-af_err af_lock_device_ptr(const af_array arr)
-{
-    try {
-        af_dtype type = getInfo(arr).getType();
-
-        switch (type) {
-        case f32: lockDevicePtr<float  >(arr); break;
-        case f64: lockDevicePtr<double >(arr); break;
-        case c32: lockDevicePtr<cfloat >(arr); break;
-        case c64: lockDevicePtr<cdouble>(arr); break;
-        case s32: lockDevicePtr<int    >(arr); break;
-        case u32: lockDevicePtr<uint   >(arr); break;
-        case s64: lockDevicePtr<intl   >(arr); break;
-        case u64: lockDevicePtr<uintl  >(arr); break;
-        case s16: lockDevicePtr<short  >(arr); break;
-        case u16: lockDevicePtr<ushort >(arr); break;
-        case u8 : lockDevicePtr<uchar  >(arr); break;
-        case b8 : lockDevicePtr<char   >(arr); break;
-        default: TYPE_ERROR(4, type);
-        }
-
-    } CATCHALL;
-
-    return AF_SUCCESS;
-}
-
-template <typename T>
-inline void unlockDevicePtr(const af_array arr)
-{
-    memPush<T>((const T *)getArray<T>(arr).get());
-}
-
-af_err af_unlock_device_ptr(const af_array arr)
-{
-    try {
-        af_dtype type = getInfo(arr).getType();
-
-        switch (type) {
-        case f32: unlockDevicePtr<float  >(arr); break;
-        case f64: unlockDevicePtr<double >(arr); break;
-        case c32: unlockDevicePtr<cfloat >(arr); break;
-        case c64: unlockDevicePtr<cdouble>(arr); break;
-        case s32: unlockDevicePtr<int    >(arr); break;
-        case u32: unlockDevicePtr<uint   >(arr); break;
-        case s64: unlockDevicePtr<intl   >(arr); break;
-        case u64: unlockDevicePtr<uintl  >(arr); break;
-        case s16: unlockDevicePtr<short  >(arr); break;
-        case u16: unlockDevicePtr<ushort >(arr); break;
-        case u8 : unlockDevicePtr<uchar  >(arr); break;
-        case b8 : unlockDevicePtr<char   >(arr); break;
-        default: TYPE_ERROR(4, type);
-        }
-
-    } CATCHALL;
-
-    return AF_SUCCESS;
-}
-
-
-af_err af_alloc_device(void **ptr, const dim_t bytes)
-{
-    try {
-        AF_CHECK(af_init());
-        *ptr = (void *)memAlloc<char>(bytes);
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_alloc_pinned(void **ptr, const dim_t bytes)
-{
-    try {
-        AF_CHECK(af_init());
-        *ptr = (void *)pinnedAlloc<char>(bytes);
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_free_device(void *ptr)
-{
-    try {
-        memFree<char>((char *)ptr);
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_free_pinned(void *ptr)
-{
-    try {
-        pinnedFree<char>((char *)ptr);
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_device_gc()
-{
-    try {
-        garbageCollect();
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-{
-    try {
-        deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers);
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_set_mem_step_size(const size_t step_bytes)
-{
-    detail::setMemStepSize(step_bytes);
-    return AF_SUCCESS;
-}
-
-af_err af_get_mem_step_size(size_t *step_bytes)
-{
-    *step_bytes =  detail::getMemStepSize();
-    return AF_SUCCESS;
-}
diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp
index 2b8a441bd3..e95ece6d4b 100644
--- a/src/api/c/err_common.cpp
+++ b/src/api/c/err_common.cpp
@@ -8,8 +8,10 @@
  ********************************************************/
 
 #include <af/exception.h>
+#include <af/device.h>
 #include <err_common.hpp>
 #include <type_util.hpp>
+#include <util.hpp>
 #include <string>
 #include <sstream>
 #include <cstring>
@@ -148,59 +150,15 @@ int DimensionError::getArgIndex() const
     return argIndex;
 }
 
-static const int MAX_ERR_SIZE = 1024;
-static std::string global_err_string;
-
 void
 print_error(const string &msg)
 {
-    const char* perr = getenv("AF_PRINT_ERRORS");
-    if(perr != nullptr) {
-        if(std::strncmp(perr, "0", 1) != 0)
+    std::string perr = getEnvVar("AF_PRINT_ERRORS");
+    if(!perr.empty()) {
+        if(perr != "0")
             fprintf(stderr, "%s\n", msg.c_str());
     }
-    global_err_string = msg;
-}
-
-void af_get_last_error(char **str, dim_t *len)
-{
-    *len = std::min(MAX_ERR_SIZE, (int)global_err_string.size());
-
-    if (*len == 0) {
-        *str = NULL;
-    }
-
-    *str = new char[*len + 1];
-    memcpy(*str, global_err_string.c_str(), *len * sizeof(char));
-
-    (*str)[*len] = '\0';
-    global_err_string = std::string("");
-}
-
-const char *af_err_to_string(const af_err err)
-{
-    switch (err) {
-    case AF_SUCCESS:            return "Success";
-    case AF_ERR_INTERNAL:       return "Internal error";
-    case AF_ERR_NO_MEM:         return "Device out of memory";
-    case AF_ERR_DRIVER:         return "Driver not available or incompatible";
-    case AF_ERR_RUNTIME:        return "Runtime error ";
-    case AF_ERR_INVALID_ARRAY:  return "Invalid array";
-    case AF_ERR_ARG:            return "Invalid input argument";
-    case AF_ERR_SIZE:           return "Invalid input size";
-    case AF_ERR_DIFF_TYPE:      return "Input types are not the same";
-    case AF_ERR_NOT_SUPPORTED:  return "Function not supported";
-    case AF_ERR_NOT_CONFIGURED: return "Function not configured to build";
-    case AF_ERR_TYPE:           return "Function does not support this data type";
-    case AF_ERR_NO_DBL:         return "Double precision not supported for this device";
-    case AF_ERR_LOAD_LIB:       return "Failed to load dynamic library. See http://www.arrayfire.com/docs/unifiedbackend.htm for instructions to set up environment for Unified backend";
-    case AF_ERR_LOAD_SYM:       return "Failed to load symbol";
-    case AF_ERR_ARR_BKND_MISMATCH   :
-                                return "There was a mismatch between an array and the current backend";
-    case AF_ERR_UNKNOWN:
-    default:
-        return "Unknown error";
-    }
+    get_global_error_string() = msg;
 }
 
 af_err processException()
@@ -260,3 +218,9 @@ af_err processException()
 
     return err;
 }
+
+// Accessor for the function-local static string that print_error() assigns to.
+std::string& get_global_error_string() {
+    static std::string lastErrorText;  // default-constructed, i.e. starts out empty
+    return lastErrorText;
+}
diff --git a/src/api/c/err_common.hpp b/src/api/c/err_common.hpp
index c8eb90a7f6..60ef64276b 100644
--- a/src/api/c/err_common.hpp
+++ b/src/api/c/err_common.hpp
@@ -203,3 +203,7 @@ void print_error(const std::string &msg);
                       __AF_FILENAME__, __LINE__,            \
                       "\n", __err);                         \
     } while(0)
+
+
+static const int MAX_ERR_SIZE = 1024;
+std::string& get_global_error_string();
diff --git a/src/api/c/error.cpp b/src/api/c/error.cpp
new file mode 100644
index 0000000000..521ca9bef5
--- /dev/null
+++ b/src/api/c/error.cpp
@@ -0,0 +1,63 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/exception.h>
+#include <af/device.h>
+#include <err_common.hpp>
+#include <string>
+#include <algorithm>
+
+// Hands the pending error text (capped at MAX_ERR_SIZE) to the caller and clears it.
+void af_get_last_error(char **str, dim_t *len) {
+    std::string &pending = get_global_error_string();
+    dim_t slen = std::min(MAX_ERR_SIZE, (int)pending.size());
+    if (len && slen == 0) {  // empty text and a length out-param: NULL / 0, no allocation
+        *len = 0;
+        *str = NULL;
+        return;
+    }
+    if (af_alloc_host((void**)str, sizeof(char) * (slen + 1)) != AF_SUCCESS || !*str) {
+        *str = NULL;         // allocation failed: report NULL / 0 instead of writing through *str
+        if (len) *len = 0;   // the text is kept so a later call can still fetch it
+        return;
+    }
+    pending.copy(*str, slen);
+    (*str)[slen] = '\0';
+    pending.clear();
+    if (len) *len = slen;
+}
+
+const char *af_err_to_string(const af_err err) // maps an af_err code to a fixed description
+{
+    switch (err) { // every branch returns a string literal; nothing is allocated here
+    case AF_SUCCESS:                return "Success";
+    case AF_ERR_NO_MEM:             return "Device out of memory";
+    case AF_ERR_DRIVER:             return "Driver not available or incompatible";
+    case AF_ERR_RUNTIME:            return "Runtime error ";
+    case AF_ERR_INVALID_ARRAY:      return "Invalid array";
+    case AF_ERR_ARG:                return "Invalid input argument";
+    case AF_ERR_SIZE:               return "Invalid input size";
+    case AF_ERR_TYPE:               return "Function does not support this data type";
+    case AF_ERR_DIFF_TYPE:          return "Input types are not the same";
+    case AF_ERR_BATCH:              return "Invalid batch configuration";
+    case AF_ERR_NOT_SUPPORTED:      return "Function not supported";
+    case AF_ERR_NOT_CONFIGURED:     return "Function not configured to build";
+    case AF_ERR_NONFREE:            return "Function unavailable. "
+                                           "ArrayFire compiled without Non-Free algorithms support";
+    case AF_ERR_NO_DBL:             return "Double precision not supported for this device";
+    case AF_ERR_NO_GFX:             return "Graphics functionality unavailable. "
+                                           "ArrayFire compiled without Graphics support";
+    case AF_ERR_LOAD_LIB:           return "Failed to load dynamic library. ";
+    case AF_ERR_LOAD_SYM:           return "Failed to load symbol";
+    case AF_ERR_ARR_BKND_MISMATCH:  return "There was a mismatch between an array and the current backend";
+    case AF_ERR_INTERNAL:           return "Internal error";
+    case AF_ERR_UNKNOWN:            // shares the text of the default branch below
+    default:                        return "Unknown error";
+    }
+}
diff --git a/src/api/c/flip.cpp b/src/api/c/flip.cpp
index 3d5bf53da8..09cbaf75e4 100644
--- a/src/api/c/flip.cpp
+++ b/src/api/c/flip.cpp
@@ -74,9 +74,9 @@ af_err af_flip(af_array *result, const af_array in, const unsigned dim)
         case u8:     out = flipArray<uchar>   (in, dim);  break;
         default:    TYPE_ERROR(1, in_type);
         }
+        swap(*result, out);
     }
     CATCHALL
 
-    swap(*result, out);
     return AF_SUCCESS;
 }
diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp
index 4b50bc046e..dc5a46b5e1 100644
--- a/src/api/c/graphics_common.cpp
+++ b/src/api/c/graphics_common.cpp
@@ -13,12 +13,29 @@
 #include <err_common.hpp>
 #include <backend.hpp>
 #include <platform.hpp>
+#include <util.hpp>
 
 using namespace std;
 
 template<typename T>
 GLenum getGLType() { return GL_FLOAT; }
 
+// Translates an af_marker_type into the matching fg::MarkerType;
+// any value without a dedicated case maps to fg::FG_NONE.
+fg::MarkerType getFGMarker(const af_marker_type af_marker) {
+    switch (af_marker) {
+        case AF_MARKER_NONE:     return fg::FG_NONE;
+        case AF_MARKER_POINT:    return fg::FG_POINT;
+        case AF_MARKER_CIRCLE:   return fg::FG_CIRCLE;
+        case AF_MARKER_SQUARE:   return fg::FG_SQUARE;
+        case AF_MARKER_TRIANGLE: return fg::FG_TRIANGLE;
+        case AF_MARKER_CROSS:    return fg::FG_CROSS;
+        case AF_MARKER_PLUS:     return fg::FG_PLUS;
+        case AF_MARKER_STAR:     return fg::FG_STAR;
+        default:                 return fg::FG_NONE;
+    }
+}
+
 #define INSTANTIATE_GET_FG_TYPE(T, ForgeEnum)\
     template<> fg::dtype getGLType<T>() { return ForgeEnum; }
 
@@ -129,8 +146,8 @@ fg::Window* ForgeManager::getMainWindow(const bool dontCreate)
     static fg::Window* wnd = NULL;
 
     // Define AF_DISABLE_GRAPHICS with any value to disable initialization
-    const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS");
-    if(!noGraphicsENV) { // If AF_DISABLE_GRAPHICS is not defined
+    std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS");
+    if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined
         if (flag && !dontCreate) {
             wnd = new fg::Window(WIDTH, HEIGHT, "ArrayFire", NULL, true);
             CheckGL("End ForgeManager::getMainWindow");
@@ -161,7 +178,7 @@ fg::Image* ForgeManager::getImage(int w, int h, fg::ChannelFormat mode, fg::dtyp
     return mImgMap[key];
 }
 
-fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type)
+fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype)
 {
     /* nPoints needs to fall in the range of [0, 2^48]
      * for the ForgeManager to correctly retrieve
@@ -169,18 +186,19 @@ fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type)
      * is a limitation on how big of an plot graph can be rendered
      * using arrayfire graphics funtionality */
     assert(nPoints <= 2ll<<48);
-    long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT);
+    long long key = ((nPoints & _48BIT) << 48);
+    key |= (((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 0x000F));
 
     PltMapIter iter = mPltMap.find(key);
     if (iter==mPltMap.end()) {
-        fg::Plot* temp = new fg::Plot(nPoints, type);
+        fg::Plot* temp = new fg::Plot(nPoints, dtype, ptype, mtype);
         mPltMap[key] = temp;
     }
 
     return mPltMap[key];
 }
 
-fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type)
+fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype)
 {
     /* nPoints needs to fall in the range of [0, 2^48]
      * for the ForgeManager to correctly retrieve
@@ -188,11 +206,12 @@ fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type)
      * is a limitation on how big of an plot graph can be rendered
      * using arrayfire graphics funtionality */
     assert(nPoints <= 2ll<<48);
-    long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT);
+    long long key = ((nPoints & _48BIT) << 48);
+    key |= (((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 0x000F));
 
     Plt3MapIter iter = mPlt3Map.find(key);
     if (iter==mPlt3Map.end()) {
-        fg::Plot3* temp = new fg::Plot3(nPoints, type);
+        fg::Plot3* temp = new fg::Plot3(nPoints, dtype, ptype, mtype);
         mPlt3Map[key] = temp;
     }
 
diff --git a/src/api/c/graphics_common.hpp b/src/api/c/graphics_common.hpp
index 082c0c7ba8..53d4629f68 100644
--- a/src/api/c/graphics_common.hpp
+++ b/src/api/c/graphics_common.hpp
@@ -30,6 +30,7 @@ GLenum glForceErrorCheck(const char *msg, const char* file, int line);
 #define ForceCheckGL(msg) glForceErrorCheck(msg, __AF_FILENAME__, __LINE__)
 #define CheckGLSkip(msg)  glErrorSkip      (msg, __AF_FILENAME__, __LINE__)
 
+fg::MarkerType getFGMarker(const af_marker_type af_marker);
 namespace graphics
 {
 
@@ -82,8 +83,8 @@ class ForgeManager
         fg::Font* getFont(const bool dontCreate=false);
         fg::Window* getMainWindow(const bool dontCreate=false);
         fg::Image* getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type);
-        fg::Plot* getPlot(int nPoints, fg::dtype type);
-        fg::Plot3* getPlot3(int nPoints, fg::dtype type);
+        fg::Plot* getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype);
+        fg::Plot3* getPlot3(int nPoints, fg::dtype dtype,fg::PlotType ptype, fg::MarkerType mtype);
         fg::Histogram* getHistogram(int nBins, fg::dtype type);
         fg::Surface* getSurface(int nX, int nY, fg::dtype type);
 
diff --git a/src/api/c/handle.hpp b/src/api/c/handle.hpp
index 70f17eb18e..ac7b74a193 100644
--- a/src/api/c/handle.hpp
+++ b/src/api/c/handle.hpp
@@ -16,6 +16,8 @@
 #include <copy.hpp>
 #include <cast.hpp>
 
+const ArrayInfo& getInfo(const af_array arr, bool check = true);
+
 template<typename T>
 static const detail::Array<T> &
 getArray(const af_array &arr)
diff --git a/src/api/c/image.cpp b/src/api/c/image.cpp
index 1d3e0970ba..2c523d0947 100644
--- a/src/api/c/image.cpp
+++ b/src/api/c/image.cpp
@@ -141,9 +141,9 @@ af_err af_create_window(af_window *out, const int width, const int height, const
 
         wnd = new fg::Window(width, height, title, mainWnd);
         wnd->setFont(fgMngr.getFont());
+        *out = reinterpret_cast<af_window>(wnd);
     }
     CATCHALL;
-    *out = reinterpret_cast<af_window>(wnd);
     return AF_SUCCESS;
 #else
     AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX);
@@ -264,6 +264,28 @@ af_err af_is_window_closed(bool *out, const af_window wind)
 #endif
 }
 
+af_err af_set_visibility(const af_window wind, const bool is_visible)
+{ // Calls show() or hide() on the fg::Window behind `wind`, depending on is_visible.
+#if defined(WITH_GRAPHICS)
+    if(wind==0) {
+        std::cerr<<"Not a valid window"<<std::endl;
+        return AF_SUCCESS; // NOTE(review): a null handle is only logged; the call still reports success
+    }
+
+    try {
+        fg::Window* wnd = reinterpret_cast<fg::Window*>(wind); // af_window is used as an opaque fg::Window*
+        if (is_visible)
+            wnd->show();
+        else
+            wnd->hide();
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+#else
+    AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); // built without WITH_GRAPHICS
+#endif
+}
+
 af_err af_destroy_window(const af_window wind)
 {
 #if defined(WITH_GRAPHICS)
diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp
index 746ee69142..5e3f7a59cb 100644
--- a/src/api/c/imageio.cpp
+++ b/src/api/c/imageio.cpp
@@ -24,6 +24,7 @@
 #include <traits.hpp>
 #include <memory.hpp>
 #include <err_common.hpp>
+#include <handle.hpp>
 
 #include <string>
 #include <cstring>
@@ -60,14 +61,15 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP
                     pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED));
                     pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN));
                     pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE));
+                    if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA));
                 } else {
                     // Non 8-bit types do not use ordering
                     // See Pixel Access Functions Chapter in FreeImage Doc
                     pDst0[indx] = (float) *(src + (x * step + 0));
                     pDst1[indx] = (float) *(src + (x * step + 1));
                     pDst2[indx] = (float) *(src + (x * step + 2));
+                    if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + 3));
                 }
-                if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA));
             }
             indx++;
         }
@@ -186,6 +188,9 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor)
             AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED);
         }
 
+        // data type
+        FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap);
+
         // sizes
         uint fi_w = FreeImage_GetWidth(pBitmap);
         uint fi_h = FreeImage_GetHeight(pBitmap);
@@ -203,21 +208,36 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor)
                 else if(fi_bpc == 16)
                     AF_CHECK((readImage<ushort, AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    switch(image_type) {
+                        case FIT_UINT32: AF_CHECK((readImage<uint,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_INT32: AF_CHECK((readImage<int,   AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_FLOAT: AF_CHECK((readImage<float,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                    }
             } else if (fi_color == 1) {
                 if(fi_bpc == 8)
                     AF_CHECK((readImage<uchar,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
                     AF_CHECK((readImage<ushort, AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    switch(image_type) {
+                        case FIT_UINT32: AF_CHECK((readImage<uint,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_INT32: AF_CHECK((readImage<int,   AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_FLOAT: AF_CHECK((readImage<float,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                    }
             } else {             //3 channel image
                 if(fi_bpc == 8)
                     AF_CHECK((readImage<uchar,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
                     AF_CHECK((readImage<ushort, AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    switch(image_type) {
+                        case FIT_UINT32: AF_CHECK((readImage<uint,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_INT32: AF_CHECK((readImage<int,   AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_FLOAT: AF_CHECK((readImage<float,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                    }
             }
         } else {                    //output gray irrespective
             if(fi_color == 1) {     //4 channel image
@@ -226,14 +246,24 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor)
                 else if(fi_bpc == 16)
                     AF_CHECK((readImage<ushort, AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    switch(image_type) {
+                        case FIT_UINT32: AF_CHECK((readImage<uint,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_INT32: AF_CHECK((readImage<int,   AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_FLOAT: AF_CHECK((readImage<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                    }
             } else if (fi_color == 3 || fi_color == 4) {
                 if(fi_bpc == 8)
                     AF_CHECK((readImage<uchar,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
                     AF_CHECK((readImage<ushort, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    switch(image_type) {
+                        case FIT_UINT32: AF_CHECK((readImage<uint,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_INT32: AF_CHECK((readImage<int,   AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        case FIT_FLOAT: AF_CHECK((readImage<float,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                        default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                    }
             }
         }
 
@@ -298,6 +328,14 @@ af_err af_save_image(const char* filename, const af_array in_)
             AF_CHECK(af_mul(&in, in_, c255, false));
             AF_CHECK(af_release_array(c255));
             free_in = true;
+        } else if(max_real < 256) {
+            in = in_;
+        } else if (max_real < 65536) {
+            af_array c257 = 0; // constant divisor; note 65535 / 257 == 255
+            AF_CHECK(af_constant(&c257, 257.0, info.ndims(), info.dims().get(), f32));
+            AF_CHECK(af_div(&in, in_, c257, false));
+            AF_CHECK(af_release_array(c257));
+            free_in = true;
         } else {
             in = in_;
         }
@@ -333,10 +371,10 @@ af_err af_save_image(const char* filename, const af_array in_)
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
-                    *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a
+                    *(pDstLine + x * step + FI_RGBA_RED  ) = (uchar) pSrc0[indx]; // r
+                    *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b
+                    *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a
                     ++indx;
                 }
                 pDstLine -= nDstPitch;
@@ -362,9 +400,9 @@ af_err af_save_image(const char* filename, const af_array in_)
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
+                    *(pDstLine + x * step + FI_RGBA_RED  ) = (uchar) pSrc0[indx]; // r
+                    *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b
                     ++indx;
                 }
                 pDstLine -= nDstPitch;
@@ -602,10 +640,10 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
-                    *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
-                    *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a
+                    *(pDstLine + x * step + FI_RGBA_RED  ) = (uchar) pSrc0[indx]; // r
+                    *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b
+                    *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a
                     ++indx;
                 }
                 pDstLine -= nDstPitch;
@@ -631,9 +669,9 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
-                    *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
+                    *(pDstLine + x * step + FI_RGBA_RED  ) = (uchar) pSrc0[indx]; // r
+                    *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b
                     ++indx;
                 }
                 pDstLine -= nDstPitch;
diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp
index d50afefb92..76c53f4ab4 100644
--- a/src/api/c/imageio2.cpp
+++ b/src/api/c/imageio2.cpp
@@ -24,6 +24,7 @@
 #include <traits.hpp>
 #include <memory.hpp>
 #include <err_common.hpp>
+#include <handle.hpp>
 
 #include <string>
 #include <cstring>
@@ -58,14 +59,15 @@ static af_err readImage_t(af_array *rImage, const uchar* pSrcLine, const int nSr
                     pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED));
                     pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN));
                     pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE));
+                    if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA));
                 } else {
                     // Non 8-bit types do not use ordering
                     // See Pixel Access Functions Chapter in FreeImage Doc
                     pDst0[indx] = (T) *(src + (x * step + 0));
                     pDst1[indx] = (T) *(src + (x * step + 1));
                     pDst2[indx] = (T) *(src + (x * step + 2));
+                    if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + 3));
                 }
-                if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA));
             }
             indx++;
         }
@@ -162,6 +164,9 @@ af_err af_load_image_native(af_array *out, const char* filename)
             AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED);
         }
 
+        // data type
+        FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap);
+
         // sizes
         uint fi_w = FreeImage_GetWidth(pBitmap);
         uint fi_h = FreeImage_GetHeight(pBitmap);
@@ -178,21 +183,36 @@ af_err af_load_image_native(af_array *out, const char* filename)
             else if(fi_bpc == 16)
                 AF_CHECK((readImage_t<ushort, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage_t<float,  AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                switch(image_type) {
+                    case FIT_UINT32: AF_CHECK((readImage_t<uint, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_INT32: AF_CHECK((readImage_t<int, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_FLOAT: AF_CHECK((readImage_t<float, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                }
         } else if (fi_color == 1) {
             if(fi_bpc == 8)
                 AF_CHECK((readImage_t<uchar,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 16)
                 AF_CHECK((readImage_t<ushort, AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage_t<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                switch(image_type) {
+                    case FIT_UINT32: AF_CHECK((readImage_t<uint,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_INT32: AF_CHECK((readImage_t<int,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_FLOAT: AF_CHECK((readImage_t<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                }
         } else {             //3 channel imag
             if(fi_bpc == 8)
                 AF_CHECK((readImage_t<uchar,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 16)
                 AF_CHECK((readImage_t<ushort, AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage_t<float,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                switch(image_type) {
+                    case FIT_UINT32: AF_CHECK((readImage_t<uint,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_INT32: AF_CHECK((readImage_t<int,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    case FIT_FLOAT: AF_CHECK((readImage_t<float,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break;
+                    default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break;
+                }
         }
 
         std::swap(*out,rImage);
@@ -236,21 +256,22 @@ static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPit
     for (uint y = 0; y < fi_h; ++y) {
         for (uint x = 0; x < fi_w; ++x) {
             if(channels == 1) {
-                *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 0
+                *(pDstLine + x * step) = (T) pSrc0[indx]; // r -> 0
             } else if(channels >=3) {
                 if((af_dtype) af::dtype_traits<T>::af_type == u8) {
-                    *(pDstLine + x * step + FI_RGBA_BLUE)  = (T) pSrc2[indx]; // b -> 0
+                    *(pDstLine + x * step + FI_RGBA_RED  ) = (T) pSrc0[indx]; // r -> 0
                     *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1
-                    *(pDstLine + x * step + FI_RGBA_RED)   = (T) pSrc0[indx]; // r -> 2
+                    *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2
+                    if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a
                 } else {
                     // Non 8-bit types do not use ordering
                     // See Pixel Access Functions Chapter in FreeImage Doc
                     *(pDstLine + x * step + 0) = (T) pSrc0[indx]; // r -> 0
                     *(pDstLine + x * step + 1) = (T) pSrc1[indx]; // g -> 1
                     *(pDstLine + x * step + 2) = (T) pSrc2[indx]; // b -> 2
+                    if(channels >= 4) *(pDstLine + x * step + 3) = (T) pSrc3[indx]; // a
                 }
             }
-            if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a
             ++indx;
         }
         pDstLine = (T*)(((uchar*)pDstLine) - nDstPitch);
@@ -373,6 +394,12 @@ af_err af_save_image_native(const char* filename, const af_array in)
     return AF_SUCCESS;
 }
 
+af_err af_is_image_io_available(bool *out)
+{
+    *out = true;        // this definition sits before "#else // WITH_FREEIMAGE": image IO is compiled in
+    return AF_SUCCESS;  // no failure path; out is dereferenced without a NULL check
+}
+
 #else   // WITH_FREEIMAGE
 #include <af/image.h>
 #include <stdio.h>
@@ -386,4 +413,10 @@ af_err af_save_image_native(const char* filename, const af_array in)
 {
     AF_RETURN_ERROR("ArrayFire compiled without Image IO (FreeImage) support", AF_ERR_NOT_CONFIGURED);
 }
+
+af_err af_is_image_io_available(bool *out)
+{
+    *out = false;       // this definition sits after "#else // WITH_FREEIMAGE": built without image IO
+    return AF_SUCCESS;  // the query itself still succeeds; out is dereferenced without a NULL check
+}
 #endif  // WITH_FREEIMAGE
diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp
index b6eb8ab4cd..4a20ca2b34 100644
--- a/src/api/c/index.cpp
+++ b/src/api/c/index.cpp
@@ -42,7 +42,7 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const
     try {
 
         ArrayInfo iInfo = getInfo(in);
-        if (ndims == 1 && ndims != (dim_t)iInfo.ndims()) {
+        if (ndims == 1 && ndims != iInfo.ndims()) {
             af_array tmp_in;
             AF_CHECK(af_flat(&tmp_in, in));
             AF_CHECK(af_index(result, tmp_in, ndims, index));
@@ -67,10 +67,10 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const
         case u8:     indexArray<uchar>   (out, in, ndims, index);  break;
         default:    TYPE_ERROR(1, in_type);
         }
+        swap(*result, out);
     }
     CATCHALL
 
-    swap(*result, out);
     return AF_SUCCESS;
 }
 
@@ -127,11 +127,9 @@ af_err af_lookup(af_array *out, const af_array in, const af_array indices, const
             case  u8: output = lookup<uchar   >(in, indices, dim); break;
             default : TYPE_ERROR(1, idxType);
         }
+        std::swap(*out, output);
     }
     CATCHALL;
-
-    std::swap(*out, output);
-
     return AF_SUCCESS;
 }
 
@@ -232,3 +230,71 @@ af_err af_index_gen(af_array *out, const af_array in, const dim_t ndims, const a
 
     return AF_SUCCESS;
 }
+
+af_seq af_make_seq(double begin, double end, double step)
+{
+    af_seq seq = {begin, end, step};   // aggregate-initialized from the three arguments, in this order
+    return seq;                        // returned by value; no validation of the range is done here
+}
+
+af_err af_create_indexers(af_index_t** indexers)
+{
+    try {
+        af_index_t* out = new af_index_t[4]();  // 4 slots (dim 0..3 in the setters); zeroed => isBatch false
+        for (int i = 0; i < 4; ++i) { out[i].idx.seq = af_span; out[i].isSeq = true; }  // unset slots span
+        std::swap(*indexers, out);              // hand the new array to the caller
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim)
+{
+    try {
+        ARG_ASSERT(0, (indexer!=NULL));
+        ARG_ASSERT(1, (idx!=NULL));
+        ARG_ASSERT(2, (dim>=0 && dim<=3));   // dim selects one of four slots in indexer
+        indexer[dim].idx.arr = idx;          // stores the handle itself; no copy or retain is made here
+        indexer[dim].isBatch = false;
+        indexer[dim].isSeq   = false;        // marks the slot as holding an array rather than a sequence
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch)
+{
+    try {
+        ARG_ASSERT(0, (indexer!=NULL));
+        ARG_ASSERT(1, (idx!=NULL));
+        ARG_ASSERT(2, (dim>=0 && dim<=3));   // dim selects one of four slots in indexer
+        indexer[dim].idx.seq = *idx;         // the sequence is copied by value out of the caller's af_seq
+        indexer[dim].isBatch = is_batch;
+        indexer[dim].isSeq   = true;         // marks the slot as holding a sequence rather than an array
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_set_seq_param_indexer(af_index_t* indexer,
+                              const double begin, const double end, const double step,
+                              const dim_t dim, const bool is_batch)
+{
+    try {
+        ARG_ASSERT(0, (indexer!=NULL));
+        ARG_ASSERT(4, (dim>=0 && dim<=3));   // dim is the fifth parameter, hence argument index 4
+        indexer[dim].idx.seq = af_make_seq(begin, end, step);   // same effect as af_set_seq_indexer, built in place
+        indexer[dim].isBatch = is_batch;
+        indexer[dim].isSeq   = true;         // marks the slot as holding a sequence rather than an array
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_release_indexers(af_index_t* indexers)
+{
+    try {
+        delete[] indexers;   // pairs with the new[] in af_create_indexers; deleting NULL is a no-op in C++
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp
new file mode 100644
index 0000000000..47c62c6478
--- /dev/null
+++ b/src/api/c/internal.cpp
@@ -0,0 +1,170 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/device.h>
+#include <af/version.h>
+#include <af/internal.h>
+#include <backend.hpp>
+#include <platform.hpp>
+#include <Array.hpp>
+#include <handle.hpp>
+#include "err_common.hpp"
+#include <cstring>
+
+using namespace detail;
+
+af_err af_create_strided_array(af_array *arr,
+                               const void *data,
+                               const dim_t offset,
+                               const unsigned ndims,
+                               const dim_t *const dims_,
+                               const dim_t *const strides_,
+                               const af_dtype ty,
+                               const af_source location)
+{
+    try {
+        // Validate inputs; each ARG_ASSERT index matches the 0-based position of the parameter checked
+        ARG_ASSERT(2, offset >= 0);
+        ARG_ASSERT(3, ndims >=1 && ndims <= 4);
+        ARG_ASSERT(4, dims_ != NULL);
+        ARG_ASSERT(5, strides_ != NULL);
+        ARG_ASSERT(5, strides_[0] == 1);
+        // Only a unit stride is accepted for dimension 0; the remaining strides must be positive
+        for (int i = 1; i < (int)ndims; i++) {
+            ARG_ASSERT(5, strides_[i] > 0);
+        }
+        // Build dim4 objects from the first ndims caller-supplied entries
+        dim4 dims(ndims, dims_);
+        dim4 strides(ndims, strides_);
+        // Derive strides for the dimensions past ndims from the previous stride and extent
+        for (int i = ndims; i < 4; i++) {
+            strides[i] = strides[i - 1] * dims[i - 1];
+        }
+
+        bool isdev = location == afDevice;
+
+        af_array res;
+        AF_CHECK(af_init());
+        // Dispatch on element type; data is cast to a pointer of that type and passed to Array<T>
+        switch (ty) {
+        case f32: res = getHandle(Array<float  >(dims, strides, offset, (float   *)data, isdev)); break;
+        case f64: res = getHandle(Array<double >(dims, strides, offset, (double  *)data, isdev)); break;
+        case c32: res = getHandle(Array<cfloat >(dims, strides, offset, (cfloat  *)data, isdev)); break;
+        case c64: res = getHandle(Array<cdouble>(dims, strides, offset, (cdouble *)data, isdev)); break;
+        case u32: res = getHandle(Array<uint   >(dims, strides, offset, (uint    *)data, isdev)); break;
+        case s32: res = getHandle(Array<int    >(dims, strides, offset, (int     *)data, isdev)); break;
+        case u64: res = getHandle(Array<uintl  >(dims, strides, offset, (uintl   *)data, isdev)); break;
+        case s64: res = getHandle(Array<intl   >(dims, strides, offset, (intl    *)data, isdev)); break;
+        case u16: res = getHandle(Array<ushort >(dims, strides, offset, (ushort  *)data, isdev)); break;
+        case s16: res = getHandle(Array<short  >(dims, strides, offset, (short   *)data, isdev)); break;
+        case b8 : res = getHandle(Array<char   >(dims, strides, offset, (char    *)data, isdev)); break;
+        case u8 : res = getHandle(Array<uchar  >(dims, strides, offset, (uchar   *)data, isdev)); break;
+        default: TYPE_ERROR(6, ty);
+        }
+        // *arr is written only after every step above has succeeded
+        std::swap(*arr, res);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in)
+{
+    try {
+        ArrayInfo info = getInfo(in);   // metadata looked up from the handle
+        *s0 = info.strides()[0];        // all four outputs are written unconditionally;
+        *s1 = info.strides()[1];        // none of the output pointers is checked for NULL
+        *s2 = info.strides()[2];
+        *s3 = info.strides()[3];
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_get_offset(dim_t *offset, const af_array arr)
+{
+    try {
+        // Read the offset recorded in the array's metadata and hand it back through the out-parameter
+        const dim_t arrOffset = getInfo(arr).getOffset();
+        *offset = arrOffset;
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+
+}
+
+af_err af_get_raw_ptr(void **ptr, const af_array arr)
+{
+    try {
+        // Starts as NULL; replaced by whatever getRawPtr returns for the array's element type
+        void *res = NULL;
+
+        af_dtype ty = getInfo(arr).getType();
+        // Dispatch on the element type so getArray<T> is instantiated with the matching T
+        switch (ty) {
+        case f32: res = (void *)getRawPtr(getArray<float  >(arr)); break;
+        case f64: res = (void *)getRawPtr(getArray<double >(arr)); break;
+        case c32: res = (void *)getRawPtr(getArray<cfloat >(arr)); break;
+        case c64: res = (void *)getRawPtr(getArray<cdouble>(arr)); break;
+        case u32: res = (void *)getRawPtr(getArray<uint   >(arr)); break;
+        case s32: res = (void *)getRawPtr(getArray<int    >(arr)); break;
+        case u64: res = (void *)getRawPtr(getArray<uintl  >(arr)); break;
+        case s64: res = (void *)getRawPtr(getArray<intl   >(arr)); break;
+        case u16: res = (void *)getRawPtr(getArray<ushort >(arr)); break;
+        case s16: res = (void *)getRawPtr(getArray<short  >(arr)); break;
+        case b8 : res = (void *)getRawPtr(getArray<char   >(arr)); break;
+        case u8 : res = (void *)getRawPtr(getArray<uchar  >(arr)); break;
+        default: TYPE_ERROR(1, ty);   // arr is argument 1 of this function
+        }
+        // *ptr is written only after the dispatch has succeeded
+        std::swap(*ptr, res);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_is_linear(bool *result, const af_array arr)
+{
+    try {
+        *result = getInfo(arr).isLinear();   // answered from the array's metadata; result is not NULL-checked
+    }
+    CATCHALL
+    return AF_SUCCESS;
+}
+
+af_err af_is_owner(bool *result, const af_array arr)
+{
+    try {
+        // Stays false unless the typed array reports ownership below
+        bool res = false;
+
+        af_dtype ty = getInfo(arr).getType();
+        // Dispatch on the element type; isOwner() already yields a bool, so no cast is needed
+        switch (ty) {
+        case f32: res = getArray<float  >(arr).isOwner(); break;
+        case f64: res = getArray<double >(arr).isOwner(); break;
+        case c32: res = getArray<cfloat >(arr).isOwner(); break;
+        case c64: res = getArray<cdouble>(arr).isOwner(); break;
+        case u32: res = getArray<uint   >(arr).isOwner(); break;
+        case s32: res = getArray<int    >(arr).isOwner(); break;
+        case u64: res = getArray<uintl  >(arr).isOwner(); break;
+        case s64: res = getArray<intl   >(arr).isOwner(); break;
+        case u16: res = getArray<ushort >(arr).isOwner(); break;
+        case s16: res = getArray<short  >(arr).isOwner(); break;
+        case b8 : res = getArray<char   >(arr).isOwner(); break;
+        case u8 : res = getArray<uchar  >(arr).isOwner(); break;
+        default: TYPE_ERROR(1, ty);   // arr is argument 1 of this function
+        }
+        // *result is written only after the dispatch has succeeded
+        std::swap(*result, res);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/lu.cpp b/src/api/c/lu.cpp
index c6004bc6cf..1d98e02490 100644
--- a/src/api/c/lu.cpp
+++ b/src/api/c/lu.cpp
@@ -95,3 +95,13 @@ af_err af_lu_inplace(af_array *pivot, af_array in, const bool is_lapack_piv)
 
     return AF_SUCCESS;
 }
+
+af_err af_is_lapack_available(bool *out)
+{
+    try {
+        *out = isLAPACKAvailable();   // forwards the helper's answer; out is not NULL-checked
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp
index 716df78028..50bcad25ee 100644
--- a/src/api/c/median.cpp
+++ b/src/api/c/median.cpp
@@ -37,12 +37,18 @@ static double median(const af_array& in)
 
     Array<T> sortedArr = sort<T, true>(input, 0);
 
+    af_array sarrHandle = getHandle<T>(sortedArr);
+
     double result;
     T resPtr[2];
     af_array res = 0;
-    AF_CHECK(af_index(&res, getHandle<T>(sortedArr), 1, mdSpan));
+    AF_CHECK(af_index(&res, sarrHandle, 1, mdSpan));
     AF_CHECK(af_get_data_ptr((void*)&resPtr, res));
 
+    AF_CHECK(af_release_array(res));
+    AF_CHECK(af_release_array(sarrHandle));
+    AF_CHECK(af_release_array(temp));
+
     if (nElems % 2 == 1) {
         result = resPtr[0];
     } else {
@@ -53,9 +59,6 @@ static double median(const af_array& in)
         }
     }
 
-    AF_CHECK(af_release_array(res));
-    AF_CHECK(af_release_array(temp));
-
     return result;
 }
 
diff --git a/src/api/c/memory.cpp b/src/api/c/memory.cpp
new file mode 100644
index 0000000000..098665ba03
--- /dev/null
+++ b/src/api/c/memory.cpp
@@ -0,0 +1,263 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/device.h>
+#include <af/version.h>
+#include <af/backend.h>
+#include <backend.hpp>
+#include <platform.hpp>
+#include <Array.hpp>
+#include <handle.hpp>
+#include <memory.hpp>
+#include "err_common.hpp"
+#include <cstring>
+
+using namespace detail;
+
+af_err af_device_array(af_array *arr, const void *data,
+                       const unsigned ndims,
+                       const dim_t * const dims,
+                       const af_dtype type)
+{
+    try {
+        AF_CHECK(af_init());
+
+        af_array res;
+        // d has four entries, so ndims must stay within 1..4 or the loop below writes out of bounds
+        DIM_ASSERT(2, ndims >= 1 && ndims <= 4);
+        dim4 d(1, 1, 1, 1);
+        for(unsigned i = 0; i < ndims; i++) {
+            d[i] = dims[i];
+            DIM_ASSERT(3, dims[i] >= 1);
+        }
+        // Dispatch on the element type; data is passed through to createDeviceDataArray unchanged
+        switch (type) {
+        case f32: res = getHandle(createDeviceDataArray<float  >(d, data)); break;
+        case f64: res = getHandle(createDeviceDataArray<double >(d, data)); break;
+        case c32: res = getHandle(createDeviceDataArray<cfloat >(d, data)); break;
+        case c64: res = getHandle(createDeviceDataArray<cdouble>(d, data)); break;
+        case s32: res = getHandle(createDeviceDataArray<int    >(d, data)); break;
+        case u32: res = getHandle(createDeviceDataArray<uint   >(d, data)); break;
+        case s64: res = getHandle(createDeviceDataArray<intl   >(d, data)); break;
+        case u64: res = getHandle(createDeviceDataArray<uintl  >(d, data)); break;
+        case s16: res = getHandle(createDeviceDataArray<short  >(d, data)); break;
+        case u16: res = getHandle(createDeviceDataArray<ushort >(d, data)); break;
+        case u8 : res = getHandle(createDeviceDataArray<uchar  >(d, data)); break;
+        case b8 : res = getHandle(createDeviceDataArray<char   >(d, data)); break;
+        default: TYPE_ERROR(4, type);
+        }
+        // *arr is written only after every step above has succeeded
+        std::swap(*arr, res);
+    } CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+af_err af_get_device_ptr(void **data, const af_array arr)
+{
+    try {
+        af_dtype type = getInfo(arr).getType();
+        // Dispatch on the element type so getArray<T> is instantiated with the matching T
+        switch (type) {
+            //FIXME: Perform copy if memory not continuous
+        case f32: *data = getDevicePtr(getArray<float  >(arr)); break;
+        case f64: *data = getDevicePtr(getArray<double >(arr)); break;
+        case c32: *data = getDevicePtr(getArray<cfloat >(arr)); break;
+        case c64: *data = getDevicePtr(getArray<cdouble>(arr)); break;
+        case s32: *data = getDevicePtr(getArray<int    >(arr)); break;
+        case u32: *data = getDevicePtr(getArray<uint   >(arr)); break;
+        case s64: *data = getDevicePtr(getArray<intl   >(arr)); break;
+        case u64: *data = getDevicePtr(getArray<uintl  >(arr)); break;
+        case s16: *data = getDevicePtr(getArray<short  >(arr)); break;
+        case u16: *data = getDevicePtr(getArray<ushort >(arr)); break;
+        case u8 : *data = getDevicePtr(getArray<uchar  >(arr)); break;
+        case b8 : *data = getDevicePtr(getArray<char   >(arr)); break;
+
+        default: TYPE_ERROR(1, type);   // arr is argument 1 of this function
+        }
+
+    } CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+template <typename T>
+inline void lockArray(const af_array arr)
+{
+    memLock((void *)getArray<T>(arr).get());   // passes the typed array's get() pointer to memLock
+}
+
+af_err af_lock_device_ptr(const af_array arr)
+{
+    return af_lock_array(arr);   // thin alias: forwards straight to af_lock_array
+}
+
+af_err af_lock_array(const af_array arr)
+{
+    try {
+        af_dtype type = getInfo(arr).getType();
+        // Dispatch on the element type so lockArray<T> is instantiated with the matching T
+        switch (type) {
+        case f32: lockArray<float  >(arr); break;
+        case f64: lockArray<double >(arr); break;
+        case c32: lockArray<cfloat >(arr); break;
+        case c64: lockArray<cdouble>(arr); break;
+        case s32: lockArray<int    >(arr); break;
+        case u32: lockArray<uint   >(arr); break;
+        case s64: lockArray<intl   >(arr); break;
+        case u64: lockArray<uintl  >(arr); break;
+        case s16: lockArray<short  >(arr); break;
+        case u16: lockArray<ushort >(arr); break;
+        case u8 : lockArray<uchar  >(arr); break;
+        case b8 : lockArray<char   >(arr); break;
+        default: TYPE_ERROR(0, type);   // arr is argument 0 of this function
+        }
+
+    } CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+template <typename T>
+inline void unlockArray(const af_array arr)
+{
+    memUnlock((void *)getArray<T>(arr).get());   // passes the typed array's get() pointer to memUnlock
+}
+
+af_err af_unlock_device_ptr(const af_array arr)
+{
+    return af_unlock_array(arr);   // thin alias: forwards straight to af_unlock_array
+}
+
+af_err af_unlock_array(const af_array arr)
+{
+    try {
+        af_dtype type = getInfo(arr).getType();
+        // Dispatch on the element type so unlockArray<T> is instantiated with the matching T
+        switch (type) {
+        case f32: unlockArray<float  >(arr); break;
+        case f64: unlockArray<double >(arr); break;
+        case c32: unlockArray<cfloat >(arr); break;
+        case c64: unlockArray<cdouble>(arr); break;
+        case s32: unlockArray<int    >(arr); break;
+        case u32: unlockArray<uint   >(arr); break;
+        case s64: unlockArray<intl   >(arr); break;
+        case u64: unlockArray<uintl  >(arr); break;
+        case s16: unlockArray<short  >(arr); break;
+        case u16: unlockArray<ushort >(arr); break;
+        case u8 : unlockArray<uchar  >(arr); break;
+        case b8 : unlockArray<char   >(arr); break;
+        default: TYPE_ERROR(0, type);   // arr is argument 0 of this function
+        }
+
+    } CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+
+af_err af_alloc_device(void **ptr, const dim_t bytes)
+{
+    try {
+        AF_CHECK(af_init());          // af_init is called before allocating
+        *ptr = memAllocUser(bytes);   // counterpart of the memFreeUser call in af_free_device
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_alloc_pinned(void **ptr, const dim_t bytes)
+{
+    try {
+        AF_CHECK(af_init());                       // af_init is called before allocating
+        *ptr = (void *)pinnedAlloc<char>(bytes);   // allocated as char elements, so the count is in bytes
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_free_device(void *ptr)
+{
+    try {
+        memFreeUser(ptr);   // counterpart of the memAllocUser call in af_alloc_device
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_free_pinned(void *ptr)
+{
+    try {
+        pinnedFree<char>((char *)ptr);   // mirrors the pinnedAlloc<char> call in af_alloc_pinned
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_alloc_host(void **ptr, const dim_t bytes)
+{
+    try {
+        *ptr = malloc(bytes);   // NOTE(review): a NULL result from malloc is not detected; AF_SUCCESS is still returned
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_free_host(void *ptr)
+{
+    try {
+        free(ptr);   // counterpart of the malloc call in af_alloc_host
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_print_mem_info(const char *msg, const int device_id)
+{
+    try {
+        int device = device_id;
+        if(device == -1) {
+            device = getActiveDeviceId();
+        }
+        // A device_id of -1 was replaced above by the active device; msg may be NULL
+        if(msg != NULL) ARG_ASSERT(0, strlen(msg) < 256); // 256 character limit on msg
+        ARG_ASSERT(1, device >= 0 && device < getDeviceCount());
+        // A NULL msg is printed as an empty string
+        printMemInfo(msg ? msg : "", device);
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_device_gc()
+{
+    try {
+        garbageCollect();   // the whole operation is delegated to this helper
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers,
+                          size_t *lock_bytes,  size_t *lock_buffers)
+{
+    try {
+        deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers);   // pointers forwarded unchecked
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_set_mem_step_size(const size_t step_bytes)
+{
+    try{
+        detail::setMemStepSize(step_bytes);   // value is forwarded as given; no range check here
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_get_mem_step_size(size_t *step_bytes)
+{
+    try {
+        *step_bytes =  detail::getMemStepSize();   // step_bytes is not NULL-checked
+    } CATCHALL;
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp
index bb156ffc2c..b8f1fafa6c 100644
--- a/src/api/c/moddims.cpp
+++ b/src/api/c/moddims.cpp
@@ -23,7 +23,7 @@ template<typename T>
 Array<T> modDims(const Array<T>& in, const af::dim4 &newDims)
 {
     //FIXME: Figure out a better way
-    evalArray<T>(in);
+    in.eval();
 
     Array<T> Out = in;
 
@@ -32,10 +32,24 @@ Array<T> modDims(const Array<T>& in, const af::dim4 &newDims)
     }
 
     Out.modDims(newDims);
+    Out.setDataDims(newDims);
 
     return Out;
 }
 
+template Array<float> modDims(const Array<float> &in, const af::dim4 &newDims);
+template Array<double> modDims(const Array<double> &in, const af::dim4 &newDims);
+template Array<cfloat> modDims(const Array<cfloat> &in, const af::dim4 &newDims);
+template Array<cdouble> modDims(const Array<cdouble> &in, const af::dim4 &newDims);
+template Array<int> modDims(const Array<int> &in, const af::dim4 &newDims);
+template Array<uint> modDims(const Array<uint> &in, const af::dim4 &newDims);
+template Array<intl> modDims(const Array<intl> &in, const af::dim4 &newDims);
+template Array<uintl> modDims(const Array<uintl> &in, const af::dim4 &newDims);
+template Array<short> modDims(const Array<short> &in, const af::dim4 &newDims);
+template Array<ushort> modDims(const Array<ushort> &in, const af::dim4 &newDims);
+template Array<uchar> modDims(const Array<uchar> &in, const af::dim4 &newDims);
+template Array<char> modDims(const Array<char> &in, const af::dim4 &newDims);
+
 af_err af_moddims(af_array *out, const af_array in,
                   const unsigned ndims, const dim_t * const dims)
 {
diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp
index 26b58a8b08..a812947228 100644
--- a/src/api/c/plot.cpp
+++ b/src/api/c/plot.cpp
@@ -27,7 +27,7 @@ using namespace detail;
 using namespace graphics;
 
 template<typename T>
-fg::Plot* setup_plot(const af_array X, const af_array Y)
+fg::Plot* setup_plot(const af_array X, const af_array Y, fg::PlotType type, fg::MarkerType marker)
 {
     Array<T> xIn = getArray<T>(X);
     Array<T> yIn = getArray<T>(Y);
@@ -39,14 +39,19 @@ fg::Plot* setup_plot(const af_array X, const af_array Y)
 
     dim4 rdims(1, 0, 2, 3);
 
-    Array<T> Z = join(1, xIn, yIn);
-    Array<T> P = reorder(Z, rdims);
+    dim_t elements = xIn.elements();
+    dim4 rowDims = dim4(1, elements, 1, 1);
 
-    ArrayInfo Xinfo = getInfo(X);
-    af::dim4 X_dims = Xinfo.dims();
+    // Force the vectors to be row vectors
+    // This ensures we can use join(0,..) and skip reorder
+    xIn.modDims(rowDims);
+    yIn.modDims(rowDims);
+
+    // join along first dimension, skip reorder
+    Array<T> P = join(0, xIn, yIn);
 
     ForgeManager& fgMngr = ForgeManager::getInstance();
-    fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType<T>());
+    fg::Plot* plot = fgMngr.getPlot(elements, getGLType<T>(), type, marker);
     plot->setColor(1.0, 0.0, 0.0);
     plot->setAxesLimits(xmax, xmin, ymax, ymin);
     plot->setAxesTitles("X Axis", "Y Axis");
@@ -55,11 +60,9 @@ fg::Plot* setup_plot(const af_array X, const af_array Y)
 
     return plot;
 }
-#endif
 
-af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props)
+af_err plotWrapper(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, fg::PlotType type=fg::FG_LINE, fg::MarkerType marker=fg::FG_NONE)
 {
-#if defined(WITH_GRAPHICS)
     if(wind==0) {
         std::cerr<<"Not a valid window"<<std::endl;
         return AF_SUCCESS;
@@ -85,12 +88,12 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co
         fg::Plot* plot = NULL;
 
         switch(Xtype) {
-            case f32: plot = setup_plot<float  >(X, Y); break;
-            case s32: plot = setup_plot<int    >(X, Y); break;
-            case u32: plot = setup_plot<uint   >(X, Y); break;
-            case s16: plot = setup_plot<short  >(X, Y); break;
-            case u16: plot = setup_plot<ushort >(X, Y); break;
-            case u8 : plot = setup_plot<uchar  >(X, Y); break;
+            case f32: plot = setup_plot<float  >(X, Y, type, marker); break;
+            case s32: plot = setup_plot<int    >(X, Y, type, marker); break;
+            case u32: plot = setup_plot<uint   >(X, Y, type, marker); break;
+            case s16: plot = setup_plot<short  >(X, Y, type, marker); break;
+            case u16: plot = setup_plot<ushort >(X, Y, type, marker); break;
+            case u8 : plot = setup_plot<uchar  >(X, Y, type, marker); break;
             default:  TYPE_ERROR(1, Xtype);
         }
 
@@ -101,6 +104,24 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co
     }
     CATCHALL;
     return AF_SUCCESS;
+}
+
+#endif // WITH_GRAPHICS
+
+af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    return plotWrapper(wind, X, Y, props);   // plotWrapper defaults: fg::FG_LINE, fg::FG_NONE
+#else
+    AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX);   // same as af_draw_scatter
+#endif
+}
+
+af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type af_marker, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    fg::MarkerType fg_marker = getFGMarker(af_marker);
+    return plotWrapper(wind, X, Y, props, fg::FG_SCATTER, fg_marker);
 #else
     AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX);
 #endif
diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp
index 1ef30e657e..2e18251b45 100644
--- a/src/api/c/plot3.cpp
+++ b/src/api/c/plot3.cpp
@@ -30,7 +30,7 @@ using namespace detail;
 using namespace graphics;
 
 template<typename T>
-fg::Plot3* setup_plot3(const af_array P)
+fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtype)
 {
     Array<T> pIn = getArray<T>(P);
     ArrayInfo Pinfo = getInfo(P);
@@ -46,37 +46,27 @@ fg::Plot3* setup_plot3(const af_array P)
         P_dims = pIn.dims();
     }
 
-    T max[3], min[3];
-    if(P_dims[0] == 3) {
-        af_get_data_ptr(max, getHandle(reduce<af_max_t, T, T>(pIn, 1)));
-        af_get_data_ptr(min, getHandle(reduce<af_min_t, T, T>(pIn, 1)));
+    if(P_dims[1] == 3){
+        pIn = transpose(pIn, false);
     }
 
-    if(P_dims[1] == 3) {
-        af_get_data_ptr(max, getHandle(reduce<af_max_t, T, T>(pIn, 0)));
-        af_get_data_ptr(min, getHandle(reduce<af_min_t, T, T>(pIn, 0)));
-    }
+    T max[3], min[3];
+    copyData(max, reduce<af_max_t, T, T>(pIn, 1));
+    copyData(min, reduce<af_min_t, T, T>(pIn, 1));
 
     ForgeManager& fgMngr = ForgeManager::getInstance();
-    fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType<T>());
+    fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType<T>(), ptype, mtype);
     plot3->setColor(1.0, 0.0, 0.0);
     plot3->setAxesLimits(max[0], min[0],
                          max[1], min[1],
                          max[2], min[2]);
     plot3->setAxesTitles("X Axis", "Y Axis", "Z Axis");
-
-    if(P_dims[1] == 3){
-        pIn = transpose(pIn, false);
-    }
     copy_plot3<T>(pIn, plot3);
-
     return plot3;
 }
-#endif
 
-af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props)
+af_err plot3Wrapper(const af_window wind, const af_array P, const af_cell* const props, const fg::PlotType type=fg::FG_LINE, const fg::MarkerType marker=fg::FG_NONE)
 {
-#if defined(WITH_GRAPHICS)
     if(wind==0) {
         std::cerr<<"Not a valid window"<<std::endl;
         return AF_SUCCESS;
@@ -91,12 +81,12 @@ af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* cons
         fg::Plot3* plot3 = NULL;
 
         switch(Ptype) {
-            case f32: plot3 = setup_plot3<float >(P); break;
-            case s32: plot3 = setup_plot3<int   >(P); break;
-            case u32: plot3 = setup_plot3<uint  >(P); break;
-            case s16: plot3 = setup_plot3<short >(P); break;
-            case u16: plot3 = setup_plot3<ushort>(P); break;
-            case u8 : plot3 = setup_plot3<uchar >(P); break;
+            case f32: plot3 = setup_plot3<float >(P, type, marker); break;
+            case s32: plot3 = setup_plot3<int   >(P, type, marker); break;
+            case u32: plot3 = setup_plot3<uint  >(P, type, marker); break;
+            case s16: plot3 = setup_plot3<short >(P, type, marker); break;
+            case u16: plot3 = setup_plot3<ushort>(P, type, marker); break;
+            case u8 : plot3 = setup_plot3<uchar >(P, type, marker); break;
             default:  TYPE_ERROR(1, Ptype);
         }
 
@@ -107,6 +97,24 @@ af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* cons
     }
     CATCHALL;
     return AF_SUCCESS;
+}
+
+#endif // WITH_GRAPHICS
+
+af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    return plot3Wrapper(wind, P, props); // wrapper defaults: fg::FG_LINE plot, fg::FG_NONE marker
+#else
+    AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); // same reporting path as af_draw_scatter3
+#endif
+}
+
+af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type af_marker, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    fg::MarkerType fg_marker = getFGMarker(af_marker);
+    return plot3Wrapper(wind, P, props, fg::FG_SCATTER, fg_marker);
 #else
     AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX);
 #endif
diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp
index d2d9921654..66133503ef 100644
--- a/src/api/c/print.cpp
+++ b/src/api/c/print.cpp
@@ -14,12 +14,14 @@
 #include <sstream>
 #include <af/array.h>
 #include <af/data.h>
+#include <af/internal.h>
 #include <copy.hpp>
 #include <print.hpp>
 #include <ArrayInfo.hpp>
 #include <err_common.hpp>
 #include <backend.hpp>
 #include <type_util.hpp>
+#include <handle.hpp>
 
 #include <af/index.h>
 
@@ -69,7 +71,7 @@ static void print(const char *exp, af_array arr, const int precision, std::ostre
 
     os << "[" << info.dims() << "]\n";
 #ifndef NDEBUG
-    os <<"   Offsets: [" << info.offsets() << "]" << std::endl;
+    os <<"   Offset: " << info.getOffset() << std::endl;
     os <<"   Strides: [" << info.strides() << "]" << std::endl;
 #endif
 
@@ -180,8 +182,8 @@ af_err af_array_to_string(char **output, const char *exp, const af_array arr,
         default:    TYPE_ERROR(1, type);
         }
         std::string str = ss.str();
-        *output = new char[str.size() + 1];
-        std::copy(str.begin(), str.end(), *output);
+        AF_CHECK(af_alloc_host((void**)output, sizeof(char) * (str.size() + 1))); // fail loudly instead of copying through an unset pointer
+        str.copy(*output, str.size());
         (*output)[str.size()] = '\0'; // don't forget the terminating 0
     }
     CATCHALL;
diff --git a/src/api/c/set.cpp b/src/api/c/set.cpp
index 1643fad95b..db9b5782e5 100644
--- a/src/api/c/set.cpp
+++ b/src/api/c/set.cpp
@@ -28,7 +28,9 @@ af_err af_set_unique(af_array *out, const af_array in, const bool is_sorted)
 {
     try {
 
-        af_dtype type = getInfo(in).getType();
+        ArrayInfo in_info = getInfo(in);
+        ARG_ASSERT(1, in_info.isVector());
+        af_dtype type = in_info.getType();
 
         af_array res;
         switch(type) {
@@ -62,8 +64,14 @@ af_err af_set_union(af_array *out, const af_array first, const af_array second,
 {
     try {
 
-        af_dtype first_type = getInfo(first).getType();
-        af_dtype second_type = getInfo(second).getType();
+        ArrayInfo first_info = getInfo(first);
+        ArrayInfo second_info = getInfo(second);
+
+        ARG_ASSERT(1, first_info.isVector());
+        ARG_ASSERT(2, second_info.isVector()); // `second` is argument index 2 (out = 0, first = 1)
+
+        af_dtype first_type = first_info.getType();
+        af_dtype second_type = second_info.getType();
 
         ARG_ASSERT(1, first_type == second_type);
 
@@ -98,8 +106,14 @@ af_err af_set_intersect(af_array *out, const af_array first, const af_array seco
 {
     try {
 
-        af_dtype first_type = getInfo(first).getType();
-        af_dtype second_type = getInfo(second).getType();
+        ArrayInfo first_info = getInfo(first);
+        ArrayInfo second_info = getInfo(second);
+
+        ARG_ASSERT(1, first_info.isVector());
+        ARG_ASSERT(2, second_info.isVector()); // `second` is argument index 2 (out = 0, first = 1)
+
+        af_dtype first_type = first_info.getType();
+        af_dtype second_type = second_info.getType();
 
         ARG_ASSERT(1, first_type == second_type);
 
diff --git a/src/api/c/sift.cpp b/src/api/c/sift.cpp
index c7a38582aa..a14badc88d 100644
--- a/src/api/c/sift.cpp
+++ b/src/api/c/sift.cpp
@@ -54,7 +54,7 @@ af_err af_sift(af_features* feat, af_array* desc, const af_array in, const unsig
                const bool double_input, const float img_scale, const float feature_ratio)
 {
     try {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
         ArrayInfo info = getInfo(in);
         af::dim4 dims  = info.dims();
 
@@ -95,7 +95,7 @@ af_err af_gloh(af_features* feat, af_array* desc, const af_array in, const unsig
                const bool double_input, const float img_scale, const float feature_ratio)
 {
     try {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
         ArrayInfo info = getInfo(in);
         af::dim4 dims  = info.dims();
 
diff --git a/src/api/c/stream.cpp b/src/api/c/stream.cpp
index a7b5771ee0..17cc945520 100644
--- a/src/api/c/stream.cpp
+++ b/src/api/c/stream.cpp
@@ -249,12 +249,17 @@ static af_array checkVersionAndRead(const char *filename, const unsigned index)
 {
     char version = 0;
 
-    std::fstream fs(filename, std::fstream::in | std::fstream::binary);
+    std::string filenameStr = std::string(filename);
+    std::fstream fs(filenameStr, std::fstream::in | std::fstream::binary);
     // Throw exception if file is not open
-    if(!fs.is_open()) AF_ERROR("File failed to open", AF_ERR_ARG);
+    if(!fs.is_open()) {
+        std::string errStr = "Failed to open: " + filenameStr;
+        AF_ERROR(errStr.c_str(), AF_ERR_ARG);
+    }
 
     if(fs.peek() == std::fstream::traits_type::eof()) {
-        AF_ERROR("File is empty", AF_ERR_ARG);
+        std::string errStr = filenameStr + " is empty";
+        AF_ERROR(errStr.c_str(), AF_ERR_ARG);
     } else {
         fs.read(&version, sizeof(char));
     }
@@ -270,13 +275,18 @@ int checkVersionAndFindIndex(const char *filename, const char *k)
 {
     char version = 0;
     std::string key(k);
+    std::string filenameStr(filename);
+    std::ifstream fs(filenameStr, std::ifstream::in | std::ifstream::binary);
 
-    std::ifstream fs(filename, std::ifstream::in | std::ifstream::binary);
     // Throw exception if file is not open
-    if(!fs.is_open()) AF_ERROR("File failed to open", AF_ERR_ARG);
+    if(!fs.is_open()) {
+        std::string errStr = "Failed to open: " + filenameStr;
+        AF_ERROR(errStr.c_str(), AF_ERR_ARG);
+    }
 
     if(fs.peek() == std::ifstream::traits_type::eof()) {
-        AF_ERROR("File is empty", AF_ERR_ARG);
+        std::string errStr = filenameStr + " is empty";
+        AF_ERROR(errStr.c_str(), AF_ERR_ARG);
     } else {
         fs.read(&version, sizeof(char));
     }
diff --git a/src/api/c/surface.cpp b/src/api/c/surface.cpp
index 7db8441163..2394f5f96c 100644
--- a/src/api/c/surface.cpp
+++ b/src/api/c/surface.cpp
@@ -49,21 +49,29 @@ fg::Surface* setup_surface(const af_array xVals, const af_array yVals, const af_
     af::dim4 Y_dims = Yinfo.dims();
     af::dim4 Z_dims = Zinfo.dims();
 
-    dim4   rdims(1, 0, 2, 3);
-    dim4 x_tdims(1, Y_dims[0], 1, 1);
-    dim4 y_tdims(1, X_dims[0], 1, 1);
     if(Xinfo.isVector()){
+        // Convert xIn to a column vector
+        xIn.modDims(xIn.elements());
+        // Now tile along second dimension
+        dim4 x_tdims(1, Y_dims[0], 1, 1);
         xIn = tile(xIn, x_tdims);
+
+        // Convert yIn to a row vector
+        yIn.modDims(af::dim4(1, yIn.elements()));
+        // Now tile along first dimension
+        dim4 y_tdims(X_dims[0], 1, 1, 1);
         yIn = tile(yIn, y_tdims);
-        yIn = reorder(yIn, rdims);
     }
 
-    xIn.modDims(xIn.elements());
-    yIn.modDims(yIn.elements());
-    zIn.modDims(zIn.elements());
-    Array<T> Z = join(1, join(1, xIn, yIn), zIn);
-    Z = reorder(Z, rdims);
-    Z.modDims(Z.elements());
+    // Flatten xIn, yIn and zIn into row vectors
+    dim4 rowDims = dim4(1, zIn.elements());
+    xIn.modDims(rowDims);
+    yIn.modDims(rowDims);
+    zIn.modDims(rowDims);
+
+    // Now join along first dimension, skip reorder
+    std::vector<Array<T> > inputs{xIn, yIn, zIn};
+    Array<T> Z = join(0, inputs);
 
     ForgeManager& fgMngr = ForgeManager::getInstance();
     fg::Surface* surface = fgMngr.getSurface(Z_dims[0], Z_dims[1], getGLType<T>());
diff --git a/src/api/c/transform.cpp b/src/api/c/transform.cpp
index bacb008c78..785a05438e 100644
--- a/src/api/c/transform.cpp
+++ b/src/api/c/transform.cpp
@@ -20,9 +20,9 @@ using namespace detail;
 
 template<typename T>
 static inline af_array transform(const af_array in, const af_array tf, const af::dim4 &odims,
-                                 const af_interp_type method, const bool inverse)
+                                 const af_interp_type method, const bool inverse, const bool perspective)
 {
-    return getHandle(transform<T>(getArray<T>(in), getArray<float>(tf), odims, method, inverse));
+    return getHandle(transform<T>(getArray<T>(in), getArray<float>(tf), odims, method, inverse, perspective));
 }
 
 af_err af_transform(af_array *out, const af_array in, const af_array tf,
@@ -41,10 +41,12 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf,
         ARG_ASSERT(5, method == AF_INTERP_NEAREST  ||
                       method == AF_INTERP_BILINEAR ||
                       method == AF_INTERP_LOWER);
-        DIM_ASSERT(2, (tdims[0] == 3 && tdims[1] == 2));
+        DIM_ASSERT(2, (tdims[0] == 3 && (tdims[1] == 2 || tdims[1] == 3)));
         DIM_ASSERT(1, idims.elements() > 0);
         DIM_ASSERT(1, (idims.ndims() == 2 || idims.ndims() == 3));
 
+        const bool perspective = (tdims[1] == 3);
+
         dim_t o0 = odim0, o1 = odim1;
         dim_t o2 = idims[2] * tdims[2];
         if (odim0 * odim1 == 0) {
@@ -55,18 +57,18 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf,
 
         af_array output = 0;
         switch(itype) {
-            case f32: output = transform<float  >(in, tf, odims, method, inverse);  break;
-            case f64: output = transform<double >(in, tf, odims, method, inverse);  break;
-            case c32: output = transform<cfloat >(in, tf, odims, method, inverse);  break;
-            case c64: output = transform<cdouble>(in, tf, odims, method, inverse);  break;
-            case s32: output = transform<int    >(in, tf, odims, method, inverse);  break;
-            case u32: output = transform<uint   >(in, tf, odims, method, inverse);  break;
-            case s64: output = transform<intl   >(in, tf, odims, method, inverse);  break;
-            case u64: output = transform<uintl  >(in, tf, odims, method, inverse);  break;
-            case s16: output = transform<short  >(in, tf, odims, method, inverse);  break;
-            case u16: output = transform<ushort >(in, tf, odims, method, inverse);  break;
-            case u8:  output = transform<uchar  >(in, tf, odims, method, inverse);  break;
-            case b8:  output = transform<char   >(in, tf, odims, method, inverse);  break;
+            case f32: output = transform<float  >(in, tf, odims, method, inverse, perspective);  break;
+            case f64: output = transform<double >(in, tf, odims, method, inverse, perspective);  break;
+            case c32: output = transform<cfloat >(in, tf, odims, method, inverse, perspective);  break;
+            case c64: output = transform<cdouble>(in, tf, odims, method, inverse, perspective);  break;
+            case s32: output = transform<int    >(in, tf, odims, method, inverse, perspective);  break;
+            case u32: output = transform<uint   >(in, tf, odims, method, inverse, perspective);  break;
+            case s64: output = transform<intl   >(in, tf, odims, method, inverse, perspective);  break;
+            case u64: output = transform<uintl  >(in, tf, odims, method, inverse, perspective);  break;
+            case s16: output = transform<short  >(in, tf, odims, method, inverse, perspective);  break;
+            case u16: output = transform<ushort >(in, tf, odims, method, inverse, perspective);  break;
+            case u8:  output = transform<uchar  >(in, tf, odims, method, inverse, perspective);  break;
+            case b8:  output = transform<char   >(in, tf, odims, method, inverse, perspective);  break;
             default:  TYPE_ERROR(1, itype);
         }
         std::swap(*out,output);
diff --git a/src/api/c/transform_coordinates.cpp b/src/api/c/transform_coordinates.cpp
new file mode 100644
index 0000000000..79b448db5d
--- /dev/null
+++ b/src/api/c/transform_coordinates.cpp
@@ -0,0 +1,111 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/vision.h>
+#include <af/image.h>
+#include <af/arith.h>
+#include <af/blas.h>
+#include <af/data.h>
+#include <err_common.hpp>
+#include <backend.hpp>
+#include <handle.hpp>
+#include <convolve.hpp>
+#include <arith.hpp>
+
+using af::dim4;
+using namespace detail;
+
+// Projects the four corners built from d0 and d1 through the 3x3 matrix tf.
+// The corner array is multiplied with columns 0, 1 and 2 of tf; the first two
+// products are divided by the third and joined along dimension 1.
+// The caller owns the returned handle; every intermediate is released here.
+template<typename T>
+static af_array transform_coordinates(const af_array& tf, const float d0, const float d1)
+{
+    dim_t in_dims[2] = { 4, 3 };
+    T h_in[4*3] = { (T)0, (T)0,  (T)d1, (T)d1,
+                    (T)0, (T)d0, (T)d0, (T)0,
+                    (T)1, (T)1,  (T)1,  (T)1 };
+
+    af_array in   = 0;
+    af_array ones = 0;
+    af_array w    = 0;
+    af_array tmp  = 0;
+    af_array xt   = 0;
+    af_array yt   = 0;
+    af_array t    = 0;
+
+    AF_CHECK(af_create_array(&in, h_in, 2, in_dims, (af_dtype) af::dtype_traits<T>::af_type));
+
+    af_array tfIdx = 0;
+    af_index_t tfIndexs[2];
+    tfIndexs[0].isSeq = true;
+    tfIndexs[1].isSeq = true;
+    tfIndexs[0].idx.seq = af_make_seq(0, 2, 1);
+    tfIndexs[1].idx.seq = af_make_seq(2, 2, 1);
+    AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs));
+
+    // w = 1 / (in * tf(0:2, 2)); a separate `ones` handle keeps the input of af_div releasable
+    AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE));
+    AF_CHECK(af_release_array(tfIdx));
+    T h_w[4] = { 1, 1, 1, 1 };
+    dim_t w_dims = 4;
+    AF_CHECK(af_create_array(&ones, h_w, 1, &w_dims, (af_dtype) af::dtype_traits<T>::af_type));
+    AF_CHECK(af_div(&w, ones, tmp, false));
+    AF_CHECK(af_release_array(ones));
+    AF_CHECK(af_release_array(tmp));
+
+    // xt = (in * tf(0:2, 0)) * w
+    tfIndexs[1].idx.seq = af_make_seq(0, 0, 1);
+    AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs));
+    AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE));
+    AF_CHECK(af_release_array(tfIdx));
+    AF_CHECK(af_mul(&xt, tmp, w, false));
+    AF_CHECK(af_release_array(tmp));
+
+    // yt = (in * tf(0:2, 1)) * w
+    tfIndexs[1].idx.seq = af_make_seq(1, 1, 1);
+    AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs));
+    AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE));
+    AF_CHECK(af_release_array(tfIdx));
+    AF_CHECK(af_mul(&yt, tmp, w, false));
+
+    AF_CHECK(af_join(&t, 1, xt, yt));
+
+    AF_CHECK(af_release_array(in));
+    AF_CHECK(af_release_array(w));
+    AF_CHECK(af_release_array(tmp));
+    AF_CHECK(af_release_array(xt));
+    AF_CHECK(af_release_array(yt));
+
+    return t;
+}
+
+af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1)
+{
+    try {
+        ArrayInfo tfInfo = getInfo(tf);
+        dim4 tfDims = tfInfo.dims();
+        ARG_ASSERT(1, (tfDims[0]==3 && tfDims[1]==3 && tfDims.ndims()==2));
+
+        af_array output;
+        af_dtype type  = tfInfo.getType();
+        switch(type) {
+            case f32: output = transform_coordinates<float >(tf, d0, d1); break;
+            case f64: output = transform_coordinates<double>(tf, d0, d1); break;
+            default : TYPE_ERROR(1, type);
+        }
+        std::swap(*out, output);
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/util.cpp b/src/api/c/util.cpp
deleted file mode 100644
index cc9a07ac4f..0000000000
--- a/src/api/c/util.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/index.h>
-// The following should be included using double quotes
-// to enable it's use in unified wrapper
-#include "err_common.hpp"
-
-af_seq af_make_seq(double begin, double end, double step)
-{
-    af_seq seq = {begin, end, step};
-    return seq;
-}
-
-af_err af_create_indexers(af_index_t** indexers)
-{
-    try {
-        af_index_t* out = new af_index_t[4];
-        std::swap(*indexers, out);
-    }
-    CATCHALL;
-    return AF_SUCCESS;
-}
-
-af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim)
-{
-    ARG_ASSERT(0, (indexer!=NULL));
-    ARG_ASSERT(1, (idx!=NULL));
-    ARG_ASSERT(2, (dim>=0 && dim<=3));
-    try {
-        indexer[dim].idx.arr = idx;
-        indexer[dim].isBatch = false;
-        indexer[dim].isSeq   = false;
-    }
-    CATCHALL
-        return AF_SUCCESS;
-}
-
-af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch)
-{
-    ARG_ASSERT(0, (indexer!=NULL));
-    ARG_ASSERT(1, (idx!=NULL));
-    ARG_ASSERT(2, (dim>=0 && dim<=3));
-    try {
-        indexer[dim].idx.seq = *idx;
-        indexer[dim].isBatch = is_batch;
-        indexer[dim].isSeq   = true;
-    }
-    CATCHALL
-        return AF_SUCCESS;
-}
-
-af_err af_set_seq_param_indexer(af_index_t* indexer,
-                              const double begin, const double end, const double step,
-                              const dim_t dim, const bool is_batch)
-{
-    ARG_ASSERT(0, (indexer!=NULL));
-    ARG_ASSERT(4, (dim>=0 && dim<=3));
-    try {
-        indexer[dim].idx.seq = af_make_seq(begin, end, step);
-        indexer[dim].isBatch = is_batch;
-        indexer[dim].isSeq   = true;
-    }
-    CATCHALL
-        return AF_SUCCESS;
-}
-
-af_err af_release_indexers(af_index_t* indexers)
-{
-    try {
-        delete[] indexers;
-    }
-    CATCHALL;
-    return AF_SUCCESS;
-}
diff --git a/src/api/c/version.cpp b/src/api/c/version.cpp
new file mode 100644
index 0000000000..91d24cb823
--- /dev/null
+++ b/src/api/c/version.cpp
@@ -0,0 +1,25 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/util.h>
+#include <version.hpp>
+
+af_err af_get_version(int *major, int *minor, int *patch)
+{
+    *major = AF_VERSION_MAJOR;
+    *minor = AF_VERSION_MINOR;
+    *patch = AF_VERSION_PATCH;
+
+    return AF_SUCCESS;
+}
+
+const char *af_get_revision()
+{
+    return AF_REVISION;
+}
diff --git a/src/api/cpp/array.cpp b/src/api/cpp/array.cpp
index f7931cfa9f..b993e2f7e8 100644
--- a/src/api/cpp/array.cpp
+++ b/src/api/cpp/array.cpp
@@ -1057,11 +1057,11 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
 
     void array::lock() const
     {
-        AF_THROW(af_lock_device_ptr(get()));
+        AF_THROW(af_lock_array(get()));
     }
 
     void array::unlock() const
     {
-        AF_THROW(af_unlock_device_ptr(get()));
+        AF_THROW(af_unlock_array(get()));
     }
 }
diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp
index bec0a60d59..faf0b0e7dd 100644
--- a/src/api/cpp/device.cpp
+++ b/src/api/cpp/device.cpp
@@ -42,11 +42,32 @@ namespace af
         return result;
     }
 
+    int getDeviceId(const array &in)
+    {
+        int device = getDevice(); // initial value, handed to af_get_device_id by pointer
+        AF_THROW(af_get_device_id(&device, in.get()));
+        return device;
+    }
+
+    af::Backend getActiveBackend()
+    {
+        af::Backend result = (af::Backend)0;
+        AF_THROW(af_get_active_backend(&result));
+        return result;
+    }
+
     void info()
     {
         AF_THROW(af_info());
     }
 
+    const char* infoString(const bool verbose)
+    {
+        char *str = NULL;
+        AF_THROW(af_info_string(&str, verbose));
+        return (const char *)str;
+    }
+
     void deviceprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute)
     {
         deviceInfo(d_name, d_platform, d_toolkit, d_compute);
@@ -140,6 +161,23 @@ namespace af
         AF_THROW(af_free_pinned((void *)ptr));
     }
 
+    void *allocHost(const size_t elements, const af::dtype type)
+    {
+        void *ptr;
+        AF_THROW(af_alloc_host(&ptr, elements * size_of(type)));
+        return ptr;
+    }
+
+    void freeHost(const void *ptr)
+    {
+        AF_THROW(af_free_host((void *)ptr));
+    }
+
+    void printMemInfo(const char *msg, const int device_id)
+    {
+        AF_THROW(af_print_mem_info(msg, device_id));
+    }
+
     void deviceGC()
     {
         AF_THROW(af_device_gc());
@@ -164,16 +202,21 @@ namespace af
         return size_bytes;
     }
 
-#define INSTANTIATE(T)                                                  \
-    template<> AFAPI                                                    \
-    T* alloc(const size_t elements)                                     \
-    {                                                                   \
-        return (T*)alloc(elements, (af::dtype)dtype_traits<T>::af_type); \
-    }                                                                   \
-    template<> AFAPI                                                    \
-    T* pinned(const size_t elements)                                    \
-    {                                                                   \
-        return (T*)pinned(elements, (af::dtype)dtype_traits<T>::af_type); \
+#define INSTANTIATE(T)                                                      \
+    template<> AFAPI                                                        \
+    T* alloc(const size_t elements)                                         \
+    {                                                                       \
+        return (T*)alloc(elements, (af::dtype)dtype_traits<T>::af_type);    \
+    }                                                                       \
+    template<> AFAPI                                                        \
+    T* pinned(const size_t elements)                                        \
+    {                                                                       \
+        return (T*)pinned(elements, (af::dtype)dtype_traits<T>::af_type);   \
+    }                                                                       \
+    template<> AFAPI                                                        \
+    T* allocHost(const size_t elements)                                     \
+    {                                                                       \
+        return (T*)allocHost(elements, (af::dtype)dtype_traits<T>::af_type);\
     }
 
     INSTANTIATE(float)
diff --git a/src/api/cpp/error.hpp b/src/api/cpp/error.hpp
index 157f8193ab..c888db8646 100644
--- a/src/api/cpp/error.hpp
+++ b/src/api/cpp/error.hpp
@@ -8,14 +8,20 @@
  ********************************************************/
 
 #include <af/exception.h>
+#include <af/device.h>
 #include <defines.hpp>
 
 #define AF_THROW(fn) do {                               \
         af_err __err = fn;                              \
         if (__err == AF_SUCCESS) break;                 \
-        throw af::exception(__AF_FILENAME__, __LINE__, __err); \
+        char *msg = NULL; af_get_last_error(&msg, NULL);\
+        af::exception ex(msg, __PRETTY_FUNCTION__,      \
+                __AF_FILENAME__, __LINE__, __err);      \
+        af_free_host(msg);                              \
+        throw ex;                                       \
     } while(0)
 
-#define AF_THROW_ERR(__msg, __err) do {                         \
-        throw af::exception(__msg, __AF_FILENAME__, __LINE__, __err);  \
+#define AF_THROW_ERR(__msg, __err) do {                 \
+        throw af::exception(__msg, __PRETTY_FUNCTION__, \
+                __AF_FILENAME__, __LINE__, __err);      \
     } while(0)
diff --git a/src/api/cpp/exception.cpp b/src/api/cpp/exception.cpp
index 373ae29c55..f88f98b0f2 100644
--- a/src/api/cpp/exception.cpp
+++ b/src/api/cpp/exception.cpp
@@ -32,8 +32,8 @@ exception::exception(const char *msg): m_err(AF_ERR_UNKNOWN)
 exception::exception(const char *file, unsigned line, af_err err): m_err(err)
 {
     snprintf(m_msg, sizeof(m_msg) - 1,
-             "ArrayFire Exception(%d): %s\nIn %s:%u",
-             (int)err, af_err_to_string(err), file, line);
+             "ArrayFire Exception (%s:%d):\nIn %s:%u",
+             af_err_to_string(err), (int)err, file, line);
 
     m_msg[sizeof(m_msg)-1] = '\0';
 }
@@ -41,11 +41,19 @@ exception::exception(const char *file, unsigned line, af_err err): m_err(err)
 exception::exception(const char *msg, const char *file, unsigned line, af_err err): m_err(err)
 {
     snprintf(m_msg, sizeof(m_msg) - 1,
-             "ArrayFire Exception(%d): %s\nIn %s:%u",
-             (int)(err), msg, file, line);
+             "ArrayFire Exception (%s:%d):\n%s\nIn %s:%u",
+             af_err_to_string(err), (int)(err), msg, file, line);
 
     m_msg[sizeof(m_msg)-1] = '\0';
 }
 
+exception::exception(const char *msg, const char *func, const char *file, unsigned line, af_err err): m_err(err)
+{
+    snprintf(m_msg, sizeof(m_msg) - 1,
+             "ArrayFire Exception (%s:%d):\n%s\nIn function %s\nIn file %s:%u",
+             af_err_to_string(err), (int)(err), msg, func, file, line);
+
+    m_msg[sizeof(m_msg)-1] = '\0';
+}
 
 }
diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp
index b7480195dc..8b53825c25 100644
--- a/src/api/cpp/graphics.cpp
+++ b/src/api/cpp/graphics.cpp
@@ -79,6 +79,18 @@ void Window::plot(const array& X, const array& Y, const char* const title)
     AF_THROW(af_draw_plot(get(), X.get(), Y.get(), &temp));
 }
 
+void Window::scatter(const array& X, const array& Y, af::markerType marker, const char* const title)
+{
+    af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
+    AF_THROW(af_draw_scatter(get(), X.get(), Y.get(), marker, &temp));
+}
+
+void Window::scatter3(const array& P, af::markerType marker, const char* const title)
+{
+    af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
+    AF_THROW(af_draw_scatter3(get(), P.get(), marker, &temp));
+}
+
 void Window::plot3(const array& P, const char* const title)
 {
     af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
@@ -93,7 +105,6 @@ void Window::hist(const array& X, const double minval, const double maxval, cons
 }
 
 void Window::surface(const array& S, const char* const title){
-    //TODO: fix offset on forge?
     af::array xVals = seq(0, S.dims(0)-1);
     af::array yVals = seq(0, S.dims(1)-1);
     af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
@@ -125,4 +136,9 @@ bool Window::close()
     return temp;
 }
 
+void Window::setVisibility(const bool isVisible)
+{
+    AF_THROW(af_set_visibility(get(), isVisible));
+}
+
 }
diff --git a/src/api/cpp/imageio.cpp b/src/api/cpp/imageio.cpp
index e70b26d1d2..75ef5fe9c4 100644
--- a/src/api/cpp/imageio.cpp
+++ b/src/api/cpp/imageio.cpp
@@ -68,4 +68,11 @@ void saveImageNative(const char* filename, const array& in)
     AF_THROW(af_save_image_native(filename, in.get()));
 }
 
+bool isImageIOAvailable()
+{
+    bool out = false;
+    AF_THROW(af_is_image_io_available(&out));
+    return out;
+}
+
 }
diff --git a/src/api/cpp/internal.cpp b/src/api/cpp/internal.cpp
new file mode 100644
index 0000000000..bdce6e155c
--- /dev/null
+++ b/src/api/cpp/internal.cpp
@@ -0,0 +1,63 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/internal.h>
+#include <af/array.h>
+#include "error.hpp"
+
+namespace af
+{
+    array createStridedArray(const void *data, const dim_t offset,
+                             const dim4 dims, const dim4 strides,
+                             const af::dtype ty,
+                             const af::source location)
+    {
+        af_array res;
+        AF_THROW(af_create_strided_array(&res, data, offset,
+                                         dims.ndims(), dims.get(), strides.get(),
+                                         ty, location));
+        return array(res);
+    }
+
+    dim4 getStrides(const array &in)
+    {
+        dim_t s0, s1, s2, s3;
+        AF_THROW(af_get_strides(&s0, &s1, &s2, &s3, in.get()));
+        return dim4(s0, s1, s2, s3);
+    }
+
+    dim_t getOffset(const array &in)
+    {
+        dim_t offset;
+        AF_THROW(af_get_offset(&offset, in.get()));
+        return offset;
+    }
+
+    void *getRawPtr(const array &in)
+    {
+        void *ptr = NULL;
+        AF_THROW(af_get_raw_ptr(&ptr, in.get()));
+        return ptr;
+    }
+
+    bool isLinear(const array &in)
+    {
+        bool is_linear = false;
+        AF_THROW(af_is_linear(&is_linear, in.get()));
+        return is_linear;
+    }
+
+    bool isOwner(const array &in)
+    {
+        bool is_owner = false;
+        AF_THROW(af_is_owner(&is_owner, in.get()));
+        return is_owner;
+    }
+
+}
diff --git a/src/api/cpp/lapack.cpp b/src/api/cpp/lapack.cpp
index cf9b3ecfd2..091c807612 100644
--- a/src/api/cpp/lapack.cpp
+++ b/src/api/cpp/lapack.cpp
@@ -153,4 +153,11 @@ namespace af
         AF_THROW(af_norm(&out, in.get(), type, p, q));
         return out;
     }
+
+    bool isLAPACKAvailable()
+    {
+        bool out = false;
+        AF_THROW(af_is_lapack_available(&out));
+        return out;
+    }
 }
diff --git a/src/api/cpp/transform_coordinates.cpp b/src/api/cpp/transform_coordinates.cpp
new file mode 100644
index 0000000000..4d896e7194
--- /dev/null
+++ b/src/api/cpp/transform_coordinates.cpp
@@ -0,0 +1,24 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/image.h>
+#include <af/array.h>
+#include "error.hpp"
+
+namespace af
+{
+
+array transformCoordinates(const array& tf, const float d0, const float d1)
+{
+    af_array out = 0;
+    AF_THROW(af_transform_coordinates(&out, tf.get(), d0, d1));
+    return array(out);
+}
+
+}
diff --git a/src/api/cpp/util.cpp b/src/api/cpp/util.cpp
index a99b8567e0..895d347d92 100644
--- a/src/api/cpp/util.cpp
+++ b/src/api/cpp/util.cpp
@@ -62,4 +62,10 @@ namespace af
         return;
     }
 
+    const char* toString(const char *exp, const array &arr, const int precision, const bool transpose)
+    {
+        char *output = NULL;
+        AF_THROW(af_array_to_string(&output, exp, arr.get(), precision, transpose));
+        return output; // NOTE(review): buffer presumably comes from af_alloc_host inside af_array_to_string, so the caller should free it with af::freeHost - confirm
+    }
 }
diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt
index 917c6dce42..c44e43b5fc 100644
--- a/src/api/unified/CMakeLists.txt
+++ b/src/api/unified/CMakeLists.txt
@@ -15,11 +15,12 @@ FILE(GLOB cpp_sources
 SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources})
 
 FILE(GLOB common_sources
-    "../c/util.cpp"
-    "../c/err_common.cpp"
-    "../c/type_util.cpp"
-    "../../backend/dim4.cpp"
-    )
+  "../c/version.cpp"
+  "../c/err_common.cpp"
+  "../c/type_util.cpp"
+  "../../backend/dim4.cpp"
+  "../../backend/util.cpp"
+  )
 
 SOURCE_GROUP(common FILES ${common_sources})
 
@@ -30,10 +31,6 @@ ENDIF()
 # OS Definitions
 IF(UNIX)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment")
-ELSE(${UNIX}) #Windows
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
-    SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj")
 ENDIF()
 
 ADD_LIBRARY(af SHARED
diff --git a/src/api/unified/array.cpp b/src/api/unified/array.cpp
index 59158ca195..809c9d4e6b 100644
--- a/src/api/unified/array.cpp
+++ b/src/api/unified/array.cpp
@@ -8,6 +8,7 @@
  ********************************************************/
 
 #include <af/array.h>
+#include <af/backend.h>
 #include "symbol_manager.hpp"
 
 af_err af_create_array(af_array *arr, const void * const data, const unsigned ndims, const dim_t * const dims, const af_dtype type)
@@ -40,8 +41,16 @@ af_err af_get_data_ptr(void *data, const af_array arr)
 
 af_err af_release_array(af_array arr)
 {
-    CHECK_ARRAYS(arr);
-    return CALL(arr);
+    af_backend curr = unified::AFSymbolManager::getInstance().getActiveBackend();
+    af_backend other = curr;
+
+    af_err err = af_get_backend_id(&other, arr);
+    if (err != AF_SUCCESS) return err;
+
+    unified::AFSymbolManager::getInstance().setBackend(other);
+    err = CALL(arr);
+    unified::AFSymbolManager::getInstance().setBackend(curr);
+    return err;
 }
 
 af_err af_retain_array(af_array *out, const af_array in)
diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp
index 43559a077a..ed8e6a37f6 100644
--- a/src/api/unified/device.cpp
+++ b/src/api/unified/device.cpp
@@ -35,6 +35,18 @@ af_err af_get_backend_id(af_backend *result, const af_array in)
     return CALL(result, in);
 }
 
+af_err af_get_device_id(int *device, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(device, in);
+}
+
+af_err af_get_active_backend(af_backend *result)
+{
+    *result = unified::AFSymbolManager::getInstance().getActiveBackend();
+    return AF_SUCCESS;
+}
+
 af_err af_info()
 {
     return CALL_NO_PARAMS();
@@ -45,6 +57,11 @@ af_err af_init()
     return CALL_NO_PARAMS();
 }
 
+af_err af_info_string(char **str, const bool verbose)
+{
+    return CALL(str, verbose);
+}
+
 af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute)
 {
     return CALL(d_name, d_platform, d_toolkit, d_compute);
@@ -95,6 +112,16 @@ af_err af_free_pinned(void *ptr)
     return CALL(ptr);
 }
 
+af_err af_alloc_host(void **ptr, const dim_t bytes)
+{
+    return CALL(ptr, bytes);
+}
+
+af_err af_free_host(void *ptr)
+{
+    return CALL(ptr);
+}
+
 af_err af_device_array(af_array *arr, const void *data, const unsigned ndims, const dim_t * const dims, const af_dtype type)
 {
     return CALL(arr, data, ndims, dims, type);
@@ -106,6 +133,11 @@ af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers,
     return CALL(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers);
 }
 
+af_err af_print_mem_info(const char *msg, const int device_id)
+{
+    return CALL(msg, device_id);
+}
+
 af_err af_device_gc()
 {
     return CALL_NO_PARAMS();
@@ -133,6 +165,18 @@ af_err af_unlock_device_ptr(const af_array arr)
     return CALL(arr);
 }
 
+af_err af_lock_array(const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
+af_err af_unlock_array(const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
 af_err af_get_device_ptr(void **ptr, const af_array arr)
 {
     CHECK_ARRAYS(arr);
diff --git a/src/api/unified/error.cpp b/src/api/unified/error.cpp
new file mode 100644
index 0000000000..0224876ec3
--- /dev/null
+++ b/src/api/unified/error.cpp
@@ -0,0 +1,51 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/exception.h>
+#include <af/device.h>
+#include <algorithm>
+#include "symbol_manager.hpp"
+
+void af_get_last_error(char **str, dim_t *len)
+{
+    // Fetch the pending error message of the unified layer, if there is one.
+    std::string &global_error_string = get_global_error_string();
+    dim_t slen = std::min(MAX_ERR_SIZE, (int)global_error_string.size());
+
+    // A non-empty string means the error was raised by the unified backend.
+    if (slen != 0) {
+        // af_alloc_host is dispatched to a backend library and can fail
+        // (e.g. no library loaded); never write through an unset pointer.
+        *str = NULL;
+        if (af_alloc_host((void**)str, sizeof(char) * (slen + 1)) != AF_SUCCESS || !*str) {
+            if (len) *len = 0;
+            *str = NULL;
+            return;
+        }
+
+        global_error_string.copy(*str, slen);
+        (*str)[slen] = '\0';
+        global_error_string = std::string("");
+        if (len) *len = slen;
+    } else {
+        // Otherwise forward the query to the active backend, if it resolves.
+        typedef void(*af_func)(char **, dim_t *);
+        af_func func = (af_func)LOAD_SYMBOL();
+        if (func) func(str, len);
+        else { *str = NULL; if (len) *len = 0; }
+    }
+}
+
+const char *af_err_to_string(const af_err err)
+{
+    typedef char *(*af_func)(af_err);
+    af_func func = (af_func)LOAD_SYMBOL();
+    return func(err);
+}
diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp
index 81076f233c..9e3f1c8b38 100644
--- a/src/api/unified/graphics.cpp
+++ b/src/api/unified/graphics.cpp
@@ -44,6 +44,18 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co
     return CALL(wind, X, Y, props);
 }
 
+af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type marker, const af_cell* const props)
+{
+    CHECK_ARRAYS(X, Y);
+    return CALL(wind, X, Y, marker, props);
+}
+
+af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type marker, const af_cell* const props)
+{
+    CHECK_ARRAYS(P);
+    return CALL(wind, P, marker, props);
+}
+
 af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props)
 {
     CHECK_ARRAYS(P);
@@ -77,6 +89,11 @@ af_err af_is_window_closed(bool *out, const af_window wind)
     return CALL(out, wind);
 }
 
+af_err af_set_visibility(const af_window wind, const bool is_visible)
+{
+    return CALL(wind, is_visible);
+}
+
 af_err af_destroy_window(const af_window wind)
 {
     return CALL(wind);
diff --git a/src/api/unified/image.cpp b/src/api/unified/image.cpp
index d0f9aa6200..0ee211d585 100644
--- a/src/api/unified/image.cpp
+++ b/src/api/unified/image.cpp
@@ -55,6 +55,11 @@ af_err af_save_image_native(const char* filename, const af_array in)
     return CALL(filename, in);
 }
 
+af_err af_is_image_io_available(bool *out)
+{
+    return CALL(out);
+}
+
 af_err af_resize(af_array *out, const af_array in, const dim_t odim0, const dim_t odim1, const af_interp_type method)
 {
     CHECK_ARRAYS(in);
@@ -69,6 +74,13 @@ af_err af_transform(af_array *out, const af_array in, const af_array transform,
     return CALL(out, in, transform, odim0, odim1, method, inverse);
 }
 
+af_err af_transform_coordinates(af_array *out, const af_array tf,
+        const float d0, const float d1)
+{
+    CHECK_ARRAYS(tf);
+    return CALL(out, tf, d0, d1);
+}
+
 af_err af_rotate(af_array *out, const af_array in, const float theta,
         const bool crop, const af_interp_type method)
 {
diff --git a/src/api/unified/index.cpp b/src/api/unified/index.cpp
index 0927dd8b71..4df5926d62 100644
--- a/src/api/unified/index.cpp
+++ b/src/api/unified/index.cpp
@@ -52,3 +52,37 @@ af_err af_assign_gen( af_array *out,
     CHECK_ARRAYS(lhs, rhs);
     return CALL(out, lhs, ndims, indices, rhs);
 }
+
+af_seq af_make_seq(double begin, double end, double step)
+{
+    af_seq seq = {begin, end, step};
+    return seq;
+}
+
+af_err af_create_indexers(af_index_t** indexers)
+{
+    return CALL(indexers);
+}
+
+af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim)
+{
+    CHECK_ARRAYS(idx);
+    return CALL(indexer, idx, dim);
+}
+
+af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch)
+{
+    return CALL(indexer, idx, dim, is_batch);
+}
+
+af_err af_set_seq_param_indexer(af_index_t* indexer,
+                              const double begin, const double end, const double step,
+                              const dim_t dim, const bool is_batch)
+{
+    return CALL(indexer, begin, end, step, dim, is_batch);
+}
+
+af_err af_release_indexers(af_index_t* indexers)
+{
+    return CALL(indexers);
+}
diff --git a/src/api/unified/internal.cpp b/src/api/unified/internal.cpp
new file mode 100644
index 0000000000..b9ac0ac277
--- /dev/null
+++ b/src/api/unified/internal.cpp
@@ -0,0 +1,54 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/internal.h>
+#include "symbol_manager.hpp"
+
+
+af_err af_create_strided_array(af_array *arr,
+                               const void *data,
+                               const dim_t offset,
+                               const unsigned ndims,
+                               const dim_t *const dims_,
+                               const dim_t *const strides_,
+                               const af_dtype ty,
+                               const af_source location)
+{
+    return CALL(arr, data, offset, ndims, dims_, strides_, ty, location);
+}
+
+af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(s0, s1, s2, s3, in);
+}
+
+af_err af_get_offset(dim_t *offset, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(offset, arr);
+}
+
+af_err af_get_raw_ptr(void **ptr, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(ptr, arr);
+}
+
+af_err af_is_linear(bool *result, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(result, arr);
+}
+
+af_err af_is_owner(bool *result, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(result, arr);
+}
diff --git a/src/api/unified/lapack.cpp b/src/api/unified/lapack.cpp
index b2364ac858..8a367017cf 100644
--- a/src/api/unified/lapack.cpp
+++ b/src/api/unified/lapack.cpp
@@ -96,3 +96,8 @@ af_err af_norm(double *out, const af_array in, const af_norm_type type, const do
     CHECK_ARRAYS(in);
     return CALL(out, in, type, p, q);
 }
+
+af_err af_is_lapack_available(bool *out)
+{
+    return CALL(out);
+}
diff --git a/src/api/unified/symbol_manager.cpp b/src/api/unified/symbol_manager.cpp
index f721e30c5f..96cec0b6ac 100644
--- a/src/api/unified/symbol_manager.cpp
+++ b/src/api/unified/symbol_manager.cpp
@@ -43,25 +43,6 @@ inline string getBkndLibName(const int backend_index)
     return LIB_AF_BKND_PREFIX + LIB_AF_BKND_NAME[i] + LIB_AF_BKND_SUFFIX;
 }
 
-inline std::string getEnvVar(const std::string &key)
-{
-#if defined(OS_WIN)
-    DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation
-    string retVal;
-    retVal.resize(bufSize);
-    bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize);
-    if (!bufSize) {
-        return string("");
-    } else {
-        retVal.resize(bufSize);
-        return retVal;
-    }
-#else
-    char * str = getenv(key.c_str());
-    return str==NULL ? string("") : string(str);
-#endif
-}
-
 /*flag parameter is not used on windows platform */
 LibHandle openDynLibrary(const int bknd_idx, int flag=RTLD_LAZY)
 {
@@ -222,8 +203,9 @@ af_err AFSymbolManager::setBackend(af::Backend bknd)
             activeHandle = defaultHandle;
             activeBackend = defaultBackend;
             return AF_SUCCESS;
-        } else
-            return AF_ERR_LOAD_LIB;
+        } else {
+            UNIFIED_ERROR_LOAD_LIB();
+        }
     }
     int idx = bknd >> 1;    // Convert 1, 2, 4 -> 0, 1, 2
     if(bkndHandles[idx]) {
@@ -231,7 +213,7 @@ af_err AFSymbolManager::setBackend(af::Backend bknd)
         activeBackend = bknd;
         return AF_SUCCESS;
     } else {
-        return AF_ERR_LOAD_LIB;
+        UNIFIED_ERROR_LOAD_LIB();
     }
 }
 
diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp
index f26e708728..658ac74b64 100644
--- a/src/api/unified/symbol_manager.hpp
+++ b/src/api/unified/symbol_manager.hpp
@@ -11,6 +11,9 @@
 #include <af/defines.h>
 #include <string>
 #include <stdlib.h>
+#include <util.hpp>
+#include <err_common.hpp>
+
 #if defined(OS_WIN)
 #include <Windows.h>
 typedef HMODULE LibHandle;
@@ -25,6 +28,13 @@ namespace unified
 const int NUM_BACKENDS = 3;
 const int NUM_ENV_VARS = 2;
 
+#define UNIFIED_ERROR_LOAD_LIB()                                        \
+    AF_RETURN_ERROR("Failed to load dynamic library. "                  \
+                    "See http://www.arrayfire.com/docs/unifiedbackend.htm " \
+                    "for instructions to set up environment for Unified backend.", \
+                    AF_ERR_LOAD_LIB)
+
+
 class AFSymbolManager {
     public:
         static AFSymbolManager& getInstance();
@@ -41,8 +51,9 @@ class AFSymbolManager {
 
         template<typename... CalleeArgs>
         af_err call(const char* symbolName, CalleeArgs... args) {
-            if (!activeHandle)
-                return AF_ERR_LOAD_LIB;
+            if (!activeHandle) {
+                UNIFIED_ERROR_LOAD_LIB();
+            }
             typedef af_err(*af_func)(CalleeArgs...);
             af_func funcHandle;
 #if defined(OS_WIN)
@@ -51,12 +62,17 @@ class AFSymbolManager {
             funcHandle = (af_func)dlsym(activeHandle, symbolName);
 #endif
             if (!funcHandle) {
-                return AF_ERR_LOAD_SYM;
+                std::string str = "Failed to load symbol: ";
+                str += symbolName;
+                AF_RETURN_ERROR(str.c_str(),
+                                AF_ERR_LOAD_SYM);
             }
 
             return funcHandle(args...);
         }
 
+        LibHandle getHandle() { return activeHandle; }
+
     protected:
         AFSymbolManager();
 
@@ -93,11 +109,12 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg)
 
 // Macro to check af_array as inputs. The arguments to this macro should be
 // only input af_arrays. Not outputs or other types.
-#define CHECK_ARRAYS(...) do {                                                              \
-    af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend();      \
-    if(!unified::checkArrays(backendId, __VA_ARGS__))                                       \
-        return AF_ERR_ARR_BKND_MISMATCH;                                                    \
-} while(0)
+#define CHECK_ARRAYS(...) do {                                          \
+        af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend(); \
+        if(!unified::checkArrays(backendId, __VA_ARGS__))               \
+            AF_RETURN_ERROR("Input array does not belong to current backend", \
+                            AF_ERR_ARR_BKND_MISMATCH);                  \
+    } while(0)
 
 #if defined(OS_WIN)
 #define CALL(...) unified::AFSymbolManager::getInstance().call(__FUNCTION__, __VA_ARGS__)
@@ -106,3 +123,9 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg)
 #define CALL(...) unified::AFSymbolManager::getInstance().call(__func__, __VA_ARGS__)
 #define CALL_NO_PARAMS() unified::AFSymbolManager::getInstance().call(__func__)
 #endif
+
+#if defined(OS_WIN)
+#define LOAD_SYMBOL() GetProcAddress(unified::AFSymbolManager::getInstance().getHandle(), __FUNCTION__)
+#else
+#define LOAD_SYMBOL() dlsym(unified::AFSymbolManager::getInstance().getHandle(), __func__)
+#endif
diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp
index 155c4f81b9..178ac87ad8 100644
--- a/src/api/unified/util.cpp
+++ b/src/api/unified/util.cpp
@@ -56,8 +56,3 @@ af_err af_example_function(af_array* out, const af_array in, const af_someenum_t
     CHECK_ARRAYS(in);
     return CALL(out, in, param);
 }
-
-af_err af_get_version(int *major, int *minor, int *patch)
-{
-    return CALL(major, minor, patch);
-}
diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp
index 219bc1991c..0937641afc 100644
--- a/src/backend/ArrayInfo.cpp
+++ b/src/backend/ArrayInfo.cpp
@@ -18,35 +18,6 @@
 
 using af::dim4;
 
-dim_t
-calcOffset(const af::dim4 &strides, const af::dim4 &offsets)
-{
-    dim_t offset = 0;
-    for (int i = 0; i < 4; i++) offset += offsets[i] * strides[i];
-    return offset;
-}
-
-
-const ArrayInfo&
-getInfo(af_array arr)
-{
-    const ArrayInfo *info = static_cast<ArrayInfo*>(reinterpret_cast<void *>(arr));
-    return *info;
-}
-
-af_err
-af_get_elements(dim_t *elems, const af_array arr)
-{
-    *elems =  getInfo(arr).elements();
-    return AF_SUCCESS; //FIXME: Catch exceptions correctly
-}
-
-af_err af_get_type(af_dtype *type, const af_array arr)
-{
-    *type = getInfo(arr).getType();
-    return AF_SUCCESS; //FIXME: Catch exceptions correctly
-}
-
 dim4 calcStrides(const dim4 &parentDim)
 {
     dim4 out(1, 1, 1, 1);
@@ -64,33 +35,33 @@ int ArrayInfo::getDevId() const
 {
     // The actual device ID is only stored in the first 4 bits of devId
     // See ArrayInfo.hpp for more
-    return devId & 0xf;
+    return devId & 0xff;
 }
 
 void ArrayInfo::setId(int id) const
 {
-    // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1
+    // 1 << (backendId + 8) sets the 9th, 10th or 11th bit of devId to 1
     // for CPU, CUDA and OpenCL respectively
     // See ArrayInfo.hpp for more
     int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2
-    const_cast<ArrayInfo *>(this)->setId(id | 1 << (backendId + 3));
+    const_cast<ArrayInfo *>(this)->setId(id | 1 << (backendId + 8));
 }
 
 void ArrayInfo::setId(int id)
 {
-    // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1
+    // 1 << (backendId + 8) sets the 9th, 10th or 11th bit of devId to 1
     // for CPU, CUDA and OpenCL respectively
     // See ArrayInfo.hpp for more
     int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2
-    devId = id | 1 << (backendId + 3);
+    devId = id | 1 << (backendId + 8);
 }
 
 af_backend ArrayInfo::getBackendId() const
 {
-    // devId >> 3 converts the backend info to 1, 2, 4 which are enums
+    // devId >> 8 converts the backend info to 1, 2, 4 which are enums
     // for CPU, CUDA and OpenCL respectively
     // See ArrayInfo.hpp for more
-    int backendId = devId >> 3;
+    int backendId = devId >> 8;
     return (af_backend)backendId;
 }
 
diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp
index ca6fcd394c..88ba26b6aa 100644
--- a/src/backend/ArrayInfo.hpp
+++ b/src/backend/ArrayInfo.hpp
@@ -16,9 +16,6 @@
 #include <vector>
 #include <cstddef>
 
-dim_t
-calcOffset(const af::dim4 &strides, const af::dim4 &offsets);
-
 af::dim4
 calcStrides(const af::dim4 &parentDim);
 
@@ -48,14 +45,15 @@ class ArrayInfo
     int             devId;
     af_dtype        type;
     af::dim4        dim_size;
-    af::dim4        dim_offsets, dim_strides;
+    dim_t           offset;
+    af::dim4        dim_strides;
 
 public:
-    ArrayInfo(int id, af::dim4 size, af::dim4 offset, af::dim4 stride, af_dtype af_type):
+    ArrayInfo(int id, af::dim4 size, dim_t offset_, af::dim4 stride, af_dtype af_type):
         devId(id),
         type(af_type),
         dim_size(size),
-        dim_offsets(offset),
+        offset(offset_),
         dim_strides(stride)
     {
         af_init();
@@ -77,13 +75,14 @@ class ArrayInfo
 
     const af_dtype& getType() const     { return type;                  }
 
-    const af::dim4& offsets() const     { return dim_offsets;           }
+    dim_t getOffset() const             { return offset;                }
 
     const af::dim4& strides() const     { return dim_strides;           }
 
     size_t elements() const             { return dim_size.elements();   }
     size_t ndims() const                { return dim_size.ndims();      }
     const af::dim4& dims() const        { return dim_size;              }
+    size_t total() const                { return offset + dim_strides[3] * dim_size[3]; }
 
     int getDevId() const;
 
@@ -97,7 +96,7 @@ class ArrayInfo
     {
         dim_size = dims;
         dim_strides = calcStrides(dims);
-        dim_offsets = af::dim4(0,0,0,0);
+        offset = 0;
     }
 
     void resetDims(const af::dim4& dims)
@@ -141,12 +140,6 @@ class ArrayInfo
     static_assert(std::is_standard_layout<ArrayInfo>::value, "ArrayInfo must be a standard layout type");
 #endif
 
-// Returns size and time info for an array object.
-// Note this doesn't require template parameters.
-const  ArrayInfo&
-getInfo(const af_array arr);
-
-
 af::dim4 toDims(const std::vector<af_seq>& seqs, const af::dim4 &parentDims);
 
 af::dim4 toOffset(const std::vector<af_seq>& seqs, const af::dim4 &parentDims);
diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp
new file mode 100644
index 0000000000..83f2de1d8d
--- /dev/null
+++ b/src/backend/MemoryManager.cpp
@@ -0,0 +1,319 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <algorithm>
+#include "MemoryManager.hpp"
+#include "dispatch.hpp"
+#include "err_common.hpp"
+#include "util.hpp"
+
+namespace common
+{
+
+MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug):
+    mem_step_size(1024),
+    max_buffers(MAX_BUFFERS),
+    memory(num_devices),
+    debug_mode(debug)
+{
+    lock_guard_t lock(this->memory_mutex);
+
+    for (int n = 0; n < num_devices; n++) {
+        // Calling getMaxMemorySize() here calls the virtual function that returns 0
+        // Call it from outside the constructor.
+        memory[n].max_bytes     = ONE_GB;
+        memory[n].total_bytes   = 0;
+        memory[n].total_buffers = 0;
+        memory[n].lock_bytes    = 0;
+        memory[n].lock_buffers  = 0;
+    }
+
+    // Check for environment variables
+
+    std::string env_var;
+
+    // Debug mode
+    env_var = getEnvVar("AF_MEM_DEBUG");
+    if (!env_var.empty()) {
+        this->debug_mode = env_var[0] != '0';
+    }
+    if (this->debug_mode) mem_step_size = 1;
+
+    // Max Buffer count
+    env_var = getEnvVar("AF_MAX_BUFFERS");
+    if (!env_var.empty()) {
+        this->max_buffers = std::max(1, std::stoi(env_var));
+    }
+}
+
+void MemoryManager::setMaxMemorySize()
+{
+    for (unsigned n = 0; n < memory.size(); n++) {
+        // Garbage collection threshold: 75% of memsize below 4GB, memsize - 1GB
+        // from 4GB up, and 1GB when memsize is reported as 0. The subtraction is
+        // unsigned, so it is only evaluated when memsize > 1GB (no wrap-around).
+        const size_t memsize = this->getMaxMemorySize(n);
+        const double reserve = memsize > ONE_GB ? (double)(memsize - ONE_GB) : 0.0;
+        memory[n].max_bytes = memsize == 0 ? ONE_GB : (size_t)std::max(memsize * 0.75, reserve);
+    }
+}
+
+void MemoryManager::garbageCollect()
+{
+    if (this->debug_mode) return;
+
+    lock_guard_t lock(this->memory_mutex);
+    memory_info& current = this->getCurrentMemoryInfo();
+
+    // Return if all buffers are locked
+    if (current.total_buffers == current.lock_buffers) return;
+
+    for (auto &kv : current.free_map) {
+        size_t num_ptrs = kv.second.size();
+        //Free memory by popping the last element
+        for (int n = num_ptrs-1; n >= 0; n--) {
+            this->nativeFree(kv.second[n]);
+            current.total_bytes -= kv.first;
+            current.total_buffers--;
+            kv.second.pop_back();
+        }
+    }
+    current.free_map.clear();
+}
+
+void MemoryManager::unlock(void *ptr, bool user_unlock)
+{
+    // Shortcut for empty arrays
+    if (!ptr) return;
+
+    lock_guard_t lock(this->memory_mutex);
+    memory_info& current = this->getCurrentMemoryInfo();
+
+    locked_iter iter = current.locked_map.find((void *)ptr);
+
+    // Pointer not found in locked map
+    if (iter == current.locked_map.end()) {
+        // Probably came from user, just free it
+        this->nativeFree(ptr);
+        return;
+    }
+
+    // Clear only the lock that the caller owns (user vs. memory manager)
+    if (user_unlock) {
+        (iter->second).user_lock = false;
+    } else {
+        (iter->second).manager_lock = false;
+    }
+
+    // Return early if either one is locked
+    if ((iter->second).user_lock || (iter->second).manager_lock) return;
+
+    // Copy the size out first: `iter` must not be used once the entry is erased
+    const size_t bytes = iter->second.bytes;
+    current.lock_bytes -= bytes;
+    current.lock_buffers--;
+    current.locked_map.erase(iter);
+
+    if (this->debug_mode) {
+        // Just free memory in debug mode (ptr is the key the entry was stored under)
+        if (bytes > 0) {
+            this->nativeFree(ptr);
+            current.total_buffers--;
+            current.total_bytes -= bytes;
+        }
+    } else {
+        // In regular mode, move buffer to free map
+        free_iter fiter = current.free_map.find(bytes);
+        if (fiter != current.free_map.end()) {
+            // If found, push back
+            fiter->second.push_back(ptr);
+        } else {
+            // If not found, create new vector for this size
+            std::vector<void *> ptrs;
+            ptrs.push_back(ptr);
+            current.free_map[bytes] = ptrs;
+        }
+    }
+}
+
+void *MemoryManager::alloc(const size_t bytes, bool user_lock)
+{
+    lock_guard_t lock(this->memory_mutex);
+
+    void *ptr = NULL;
+    size_t alloc_bytes = this->debug_mode ? bytes : (divup(bytes, mem_step_size) * mem_step_size);
+
+    if (bytes > 0) {
+        memory_info& current = this->getCurrentMemoryInfo();
+
+        // There is no memory cache in debug mode
+        if (!this->debug_mode) {
+
+            // FIXME: Add better checks for garbage collection
+            // Perhaps look at total memory available as a metric
+            if (this->checkMemoryLimit()) {
+                this->garbageCollect();
+            }
+
+            free_iter iter = current.free_map.find(alloc_bytes);
+
+            if (iter != current.free_map.end() && !iter->second.empty()) {
+                ptr = iter->second.back();
+                iter->second.pop_back();
+            }
+
+        }
+
+        // Only comes here if buffer size not found or in debug mode
+        if (ptr == NULL) {
+            // Perform garbage collection if memory can not be allocated
+            try {
+                ptr = this->nativeAlloc(alloc_bytes);
+            } catch (AfError &ex) {
+                // If out of memory, run garbage collect and try again
+                if (ex.getError() != AF_ERR_NO_MEM) throw;
+                this->garbageCollect();
+                ptr = this->nativeAlloc(alloc_bytes);
+            }
+            // Increment these two only when it succeeds to come here.
+            current.total_bytes += alloc_bytes;
+            current.total_buffers += 1;
+        }
+
+
+        locked_info info = {true, user_lock, alloc_bytes};
+        current.locked_map[ptr] = info;
+        current.lock_bytes += alloc_bytes;
+        current.lock_buffers++;
+    }
+    return ptr;
+}
+
+void MemoryManager::userLock(const void *ptr)
+{
+    // Take the mutex before touching per-device state, like the other methods
+    lock_guard_t lock(this->memory_mutex);
+    memory_info& current = this->getCurrentMemoryInfo();
+
+    locked_iter iter = current.locked_map.find(const_cast<void *>(ptr));
+
+    if (iter != current.locked_map.end()) {
+        iter->second.user_lock = true;
+    } else {
+        // Pointer is not tracked yet: record it as locked by the user only
+        locked_info info = {false,
+                            true,
+                            100}; //This number is not relevant
+        current.locked_map[const_cast<void *>(ptr)] = info;
+    }
+}
+
+void MemoryManager::userUnlock(const void *ptr)
+{
+    this->unlock(const_cast<void *>(ptr), true);
+}
+
+size_t MemoryManager::getMemStepSize()
+{
+    lock_guard_t lock(this->memory_mutex);
+    return this->mem_step_size;
+}
+
+void MemoryManager::setMemStepSize(size_t new_step_size)
+{
+    lock_guard_t lock(this->memory_mutex);
+    this->mem_step_size = new_step_size;
+}
+
+size_t MemoryManager::getMaxBytes()
+{
+    lock_guard_t lock(this->memory_mutex);
+    return this->getCurrentMemoryInfo().max_bytes;
+}
+
+void MemoryManager::printInfo(const char *msg, const int device)
+{
+    lock_guard_t lock(this->memory_mutex);
+    memory_info& current = this->getCurrentMemoryInfo();
+
+    std::cout << msg << std::endl;
+
+    static const std::string head("|     POINTER      |    SIZE    |  AF LOCK  | USER LOCK |");
+    static const std::string line(head.size(), '-');
+    std::cout << line << std::endl << head << std::endl << line << std::endl;
+
+    for(auto& kv : current.locked_map) {
+        std::string status_mngr("Yes");
+        std::string status_user("Unknown");
+        if(kv.second.user_lock)     status_user = "Yes";
+        else                        status_user = " No";
+
+        std::string unit = "KB";
+        double size = (double)(kv.second.bytes) / 1024;
+        if(size >= 1024) {
+            size = size / 1024;
+            unit = "MB";
+        }
+
+        std::cout << "|  " << std::right << std::setw(14) << kv.first << " "
+                  << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit
+                  << " | " << std::setw(9) << status_mngr
+                  << " | " << std::setw(9) << status_user
+                  << " |"  << std::endl;
+    }
+
+    for(auto &kv : current.free_map) {
+
+        std::string status_mngr("No");
+        std::string status_user("No");
+
+        std::string unit = "KB";
+        double size = (double)(kv.first) / 1024;
+        if(size >= 1024) {
+            size = size / 1024;
+            unit = "MB";
+        }
+
+        for (auto &ptr : kv.second) {
+            std::cout << "|  " << std::right << std::setw(14) << ptr << " "
+                      << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit
+                      << " | " << std::setw(9) << status_mngr
+                      << " | " << std::setw(9) << status_user
+                      << " |"  << std::endl;
+        }
+    }
+
+    std::cout << line << std::endl;
+}
+
+void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                               size_t *lock_bytes,  size_t *lock_buffers)
+{
+    lock_guard_t lock(this->memory_mutex);
+    const memory_info& current = this->getCurrentMemoryInfo(); // reference: avoid copying both maps
+    if (alloc_bytes   ) *alloc_bytes   = current.total_bytes;
+    if (alloc_buffers ) *alloc_buffers = current.total_buffers;
+    if (lock_bytes    ) *lock_bytes    = current.lock_bytes;
+    if (lock_buffers  ) *lock_buffers  = current.lock_buffers;
+}
+
+unsigned MemoryManager::getMaxBuffers()
+{
+    return this->max_buffers;
+}
+
+bool MemoryManager::checkMemoryLimit()
+{
+    memory_info& current = this->getCurrentMemoryInfo();
+    return current.lock_bytes >= current.max_bytes || current.total_buffers >= this->max_buffers;
+}
+
+}
diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp
new file mode 100644
index 0000000000..0db70b572d
--- /dev/null
+++ b/src/backend/MemoryManager.hpp
@@ -0,0 +1,138 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+
+#include <cstddef>
+#include <cstdlib>
+#include <map>
+#include <mutex>
+#include <vector>
+
+namespace common
+{
+
+// std::recursive_mutex lets the owning thread re-acquire the lock.
+typedef std::recursive_mutex mutex_t;
+typedef std::lock_guard<mutex_t> lock_guard_t;
+
+// A buffer-count constant (presumably the default limit; confirm) and 2^30 bytes.
+const unsigned MAX_BUFFERS   = 1000;
+const size_t ONE_GB = 1 << 30;
+
+// Per-device bookkeeping of allocated buffers. Subclasses may override the
+// virtual hooks; the defaults use the C heap and a single device with
+// id 0.
+class MemoryManager
+{
+    // Lock flags and byte size recorded for one tracked pointer.
+    typedef struct
+    {
+        bool manager_lock;
+        bool user_lock;
+        size_t bytes;
+    } locked_info;
+
+    typedef std::map<void *, locked_info> locked_t;
+    typedef locked_t::iterator locked_iter;
+
+    // Buffer size in bytes -> pointers of that size.
+    typedef std::map<size_t, std::vector<void *> >free_t;
+    typedef free_t::iterator free_iter;
+
+    // Maps and running totals kept separately for each device.
+    typedef struct
+    {
+        locked_t locked_map;
+        free_t   free_map;
+
+        size_t lock_bytes;
+        size_t lock_buffers;
+        size_t total_bytes;
+        size_t total_buffers;
+        size_t max_bytes;
+    } memory_info;
+
+    size_t mem_step_size;
+    unsigned max_buffers;
+    std::vector<memory_info> memory;
+    bool debug_mode;
+
+    // Bookkeeping entry of the device reported by getActiveDeviceId().
+    memory_info& getCurrentMemoryInfo()
+    {
+        return memory[this->getActiveDeviceId()];
+    }
+
+    // Index into memory of the device in use; the base class always uses 0.
+    virtual int getActiveDeviceId()
+    {
+        return 0;
+    }
+
+    // Maximum memory size for a device id; the base class reports 0.
+    virtual size_t getMaxMemorySize(int /* id */)
+    {
+        return 0;
+    }
+
+public:
+    MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug);
+
+    void setMaxMemorySize();
+
+    void *alloc(const size_t bytes, bool user_lock);
+
+    void unlock(void *ptr, bool user_unlock);
+
+    void garbageCollect();
+
+    void printInfo(const char *msg, const int device);
+
+    // Each out-pointer may be null, in which case that value is skipped.
+    void bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                    size_t *lock_bytes,  size_t *lock_buffers);
+
+    void userLock(const void *ptr);
+
+    void userUnlock(const void *ptr);
+
+    size_t getMemStepSize();
+
+    size_t getMaxBytes();
+
+    unsigned getMaxBuffers();
+
+    void setMemStepSize(size_t new_step_size);
+
+    // Raw allocation hook; the default takes memory from the C heap.
+    virtual void *nativeAlloc(const size_t bytes)
+    {
+        return std::malloc(bytes);
+    }
+
+    // Raw release hook matching nativeAlloc().
+    virtual void nativeFree(void *ptr)
+    {
+        std::free(ptr);
+    }
+
+    virtual ~MemoryManager()
+    {
+    }
+
+    // True once locked bytes or the total buffer count reach their limits.
+    bool checkMemoryLimit();
+
+protected:
+    mutex_t memory_mutex;
+
+};
+
+}
diff --git a/src/backend/cblas.cpp b/src/backend/cblas.cpp
index 4d99d457c2..1be15e47c9 100644
--- a/src/backend/cblas.cpp
+++ b/src/backend/cblas.cpp
@@ -12,11 +12,11 @@
 #ifdef AF_CPU
     #include <blas.hpp>
 #else
-    #ifdef __APPLE__
-        #include <Accelerate/Accelerate.h>
+    #ifdef USE_MKL
+        #include <mkl_cblas.h>
     #else
-        #ifdef USE_MKL
-            #include <mkl_cblas.h>
+        #ifdef __APPLE__
+            #include <Accelerate/Accelerate.h>
         #else
             extern "C" {
                 #include <cblas.h>
diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp
index 8cf1b752f8..2c296d02d3 100644
--- a/src/backend/cpu/Array.cpp
+++ b/src/backend/cpu/Array.cpp
@@ -8,262 +8,241 @@
  ********************************************************/
 
 #include <af/dim4.hpp>
+#include <err_common.hpp>
 #include <Array.hpp>
 #include <copy.hpp>
+#include <kernel/Array.hpp>
 #include <TNJ/BufferNode.hpp>
 #include <TNJ/ScalarNode.hpp>
 #include <memory.hpp>
 #include <platform.hpp>
+#include <queue.hpp>
 #include <cstring>
 #include <cstddef>
+#include <MemoryManager.hpp>
 
 namespace cpu
 {
-    const int MAX_TNJ_LEN = 20;
-    using TNJ::BufferNode;
-    using TNJ::Node;
-    using TNJ::Node_ptr;
-
-    using af::dim4;
-
-    template<typename T>
-    Array<T>::Array(dim4 dims):
-        info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
-    { }
-
-    template<typename T>
-    Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
-        info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data((is_device & !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
-    {
-        static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
-        static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
-        if (!is_device || copy_device) {
-            std::copy(in_data, in_data + dims.elements(), data.get());
-        }
-    }
-
 
-    template<typename T>
-    Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data(), data_dims(dims),
-        node(n), offset(0), ready(false), owner(true)
-    {
+const int MAX_TNJ_LEN = 20;
+using TNJ::BufferNode;
+using TNJ::Node;
+using TNJ::Node_ptr;
+
+using af::dim4;
+
+template<typename T>
+Array<T>::Array(dim4 dims): // owning array; storage for dims.elements() values comes from memAlloc
+    info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type), // offset argument is 0
+    data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims), // memFree<T> is the deleter of the shared buffer
+    node(), ready(true), owner(true) // no JIT node attached, so the array starts out ready
+{ }
+
+template<typename T>
+Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): // adopts in_data only when is_device && !copy_device, otherwise copies it
+    info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+    data((is_device && !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
+    node(), ready(true), owner(true)
+{
+    static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
+    static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
+    if (!is_device || copy_device) { // exactly the cases in which a fresh buffer was allocated above
+        std::copy(in_data, in_data + dims.elements(), data.get());
     }
+}
 
-    template<typename T>
-    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
-        info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
-        data(parent.getData()), data_dims(parent.getDataDims()),
-        node(),
-        offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
-        ready(true), owner(false)
-    { }
-
-    template<typename T>
-    std::shared_ptr<T> evalNodes(const int &num,
-                                 const dim4 &odims,
-                                 const dim4 &ostrs,
-                                 TNJ::Node_ptr &node)
-    {
-
-        std::shared_ptr<T> data(memAlloc<T>(num), memFree<T>);
-        T *ptr = data.get();
-
-        bool is_linear = node->isLinear(odims.get());
-
-        if (is_linear) {
-            for (int i = 0; i < num; i++) {
-                ptr[i] = *(T *)node->calc(i);
-            }
-        } else {
-            for (int w = 0; w < (int)odims[3]; w++) {
-                dim_t offw = w * ostrs[3];
-
-                for (int z = 0; z < (int)odims[2]; z++) {
-                    dim_t offz = z * ostrs[2] + offw;
-
-                    for (int y = 0; y < (int)odims[1]; y++) {
-                        dim_t offy = y * ostrs[1] + offz;
-
-                        for (int x = 0; x < (int)odims[0]; x++) {
-                            dim_t id = x + offy;
-
-                            ptr[id] = *(T *)node->calc(x, y, z, w);
-                        }
-                    }
-                }
-            }
-        }
-
-        return data;
-    }
+template<typename T>
+Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) : // lazy array described by JIT node n
+    info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+    data(), data_dims(dims), // no buffer yet; eval() allocates it
+    node(n), ready(false), owner(true) // ready(false): eval() has to run before data is usable
+{
+}
 
-    template<typename T>
-    void Array<T>::eval()
-    {
-        if (isReady()) return;
+template<typename T>
+Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) : // non-owning view into parent's buffer
+    info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type), // offset_ is stored as given, not added to parent's offset here
+    data(parent.getData()), data_dims(parent.getDataDims()), // shares parent's shared_ptr rather than copying elements
+    node(),
+    ready(true), owner(false)
+{ }
+
+template<typename T>
+Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, // array with caller-supplied strides and offset
+                const T * const in_data, bool is_device) :
+    info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type),
+    data(is_device ? (T*)in_data : memAlloc<T>(info.total()), memFree<T>), // info.total() is usable here only because info is initialized before data
+    data_dims(dims),
+    node(),
+    ready(true),
+    owner(true)
+{
+    if (!is_device) { // host input: copy info.total() elements into the freshly allocated buffer
+        std::copy(in_data, in_data + info.total(), data.get());
+    }
+}
 
-        this->setId(getActiveDeviceId());
+template<typename T>
+void Array<T>::eval()
+{
+    if (isReady()) return; // nothing to do when the array is already marked ready
+    if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL); // presumably: refuse to evaluate from the queue's own worker thread (confirm)
 
-        data = evalNodes<T>(elements(), dims(), strides(), node);
+    this->setId(getActiveDeviceId());
 
-        ready = true;
-        Node_ptr prev = node;
-        prev->reset();
-        // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
-        node.reset();
-    }
+    data = std::shared_ptr<T>(memAlloc<T>(elements()), memFree<T>); // output buffer the queued kernel writes into
 
-    template<typename T>
-    void Array<T>::eval() const
-    {
-        if (isReady()) return;
-        const_cast<Array<T> *>(this)->eval();
-    }
+    getQueue().enqueue(kernel::evalArray<T>, *this); // evalArray takes Array<T> by value, so it gets its own handle to data and node
 
-    template<typename T>
-    Node_ptr Array<T>::getNode() const
-    {
-        if (!node) {
+    ready = true; // set right after enqueueing, not after the kernel has run
+    Node_ptr prev = node;
+    prev->reset();
+    // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
+    node.reset(); // drop this array's reference to the JIT node
+}
 
-            unsigned bytes = this->getDataDims().elements() * sizeof(T);
+template<typename T>
+void Array<T>::eval() const
+{
+    // Forward to the non-const overload, which does the actual evaluation.
+    if (!isReady()) const_cast<Array<T> *>(this)->eval();
+}
 
-            BufferNode<T> *buf_node = new BufferNode<T>(data,
-                                                        bytes,
-                                                        offset,
-                                                        dims().get(),
-                                                        strides().get(),
-                                                        isLinear());
+template<typename T>
+Node_ptr Array<T>::getNode() const
+{
+    if (!node) { // no JIT node yet: wrap the existing buffer in a BufferNode and cache it
 
-            const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node));
-        }
+        unsigned bytes = this->getDataDims().elements() * sizeof(T); // NOTE(review): narrows to unsigned; confirm buffers of 4 GiB or more cannot reach here
 
-        return node;
-    }
+        BufferNode<T> *buf_node = new BufferNode<T>(data,
+                                                    bytes,
+                                                    getOffset(),
+                                                    dims().get(),
+                                                    strides().get(),
+                                                    isLinear());
 
-    template<typename T>
-    Array<T>
-    createHostDataArray(const dim4 &size, const T * const data)
-    {
-        return Array<T>(size, data, false);
+        const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node)); // const method caches the node, hence the const_cast
     }
 
-    template<typename T>
-    Array<T>
-    createDeviceDataArray(const dim4 &size, const void *data)
-    {
-        return Array<T>(size, (const T * const) data, true);
-    }
+    return node;
+}
 
-    template<typename T>
-    Array<T>
-    createValueArray(const dim4 &size, const T& value)
-    {
-        TNJ::ScalarNode<T> *node = new TNJ::ScalarNode<T>(value);
-        return createNodeArray<T>(size, TNJ::Node_ptr(
-                                      reinterpret_cast<TNJ::Node *>(node)));
-    }
+template<typename T>
+Array<T> createHostDataArray(const dim4 &size, const T * const data)
+{
+    const bool on_device = false;  // host input: the constructor copies it into a new buffer
+    return Array<T>(size, data, on_device);
+}
 
-    template<typename T>
-    Array<T>
-    createEmptyArray(const dim4 &size)
-    {
-        return Array<T>(size);
-    }
+template<typename T>
+Array<T> createDeviceDataArray(const dim4 &size, const void *data)
+{
+    const T * const typed = (const T * const) data;
+    return Array<T>(size, typed, true);  // is_device without copy_device: the pointer is adopted, not copied
+}
 
-    template<typename T>
-    Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); }
+template<typename T>
+Array<T> createValueArray(const dim4 &size, const T& value)
+{
+    // Wrap the scalar in a JIT node and build the array from that node.
+    TNJ::ScalarNode<T> *scalar = new TNJ::ScalarNode<T>(value);
+    TNJ::Node_ptr wrapped(reinterpret_cast<TNJ::Node *>(scalar));
+    return createNodeArray<T>(size, wrapped);
+}
 
+template<typename T>
+Array<T> createEmptyArray(const dim4 &size)
+{
+    // The dims-only constructor allocates storage for size.elements() values.
+    return Array<T>(size);
+}
 
-    template<typename T>
-    Array<T>
-    createNodeArray(const dim4 &dims, Node_ptr node)
-    {
-        Array<T> out =  Array<T>(dims, node);
+template<typename T>
+Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); } // heap-allocated array with all-zero dims; released with destroyArray()
 
-        unsigned length =0, buf_count = 0, bytes = 0;
+template<typename T>
+Array<T>
+createNodeArray(const dim4 &dims, Node_ptr node)
+{
+    Array<T> out =  Array<T>(dims, node); // lazy array; not evaluated unless a limit below is hit
 
-        Node *n = node.get();
-        n->getInfo(length, buf_count, bytes);
-        n->reset();
+    unsigned length =0, buf_count = 0, bytes = 0; // filled in by getInfo() below
 
-        if (length > MAX_TNJ_LEN ||
-            buf_count >= MAX_BUFFERS ||
-            bytes >= MAX_BYTES) {
-            out.eval();
-        }
+    Node *n = node.get();
+    n->getInfo(length, buf_count, bytes);
+    n->reset(); // presumably clears traversal state left behind by getInfo() (confirm)
 
-        return out;
+    if (length > getMaxJitSize() ||
+        buf_count >= getMaxBuffers() ||
+        bytes >= getMaxBytes()) { // any limit reached: evaluate now instead of growing the tree
+        out.eval();
     }
 
+    return out;
+}
 
-    template<typename T>
-    Array<T> createSubArray(const Array<T>& parent,
-                            const std::vector<af_seq> &index,
-                            bool copy)
-    {
-        parent.eval();
-
-        dim4 dDims = parent.getDataDims();
-        dim4 pDims = parent.dims();
+template<typename T>
+Array<T> createSubArray(const Array<T>& parent,
+                        const std::vector<af_seq> &index,
+                        bool copy)
+{
+    parent.eval(); // the view shares parent's buffer, so parent must be evaluated first
 
-        dim4 dims   = toDims  (index, pDims);
-        dim4 offset = toOffset(index, dDims);
-        dim4 stride = toStride (index, dDims);
+    dim4 dDims = parent.getDataDims();
+    dim4 pDims = parent.dims();
 
-        Array<T> out = Array<T>(parent, dims, offset, stride);
+    dim4 dims    = toDims  (index, pDims);
+    dim4 strides = toStride (index, dDims);
 
-        if (!copy) return out;
+    // Find total offsets after indexing
+    dim4 offsets = toOffset(index, pDims);
+    dim4 parent_strides = parent.strides();
+    dim_t offset = parent.getOffset(); // start from the parent's own offset into the shared buffer
+    for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; // scale each per-dimension offset by the parent's stride
 
-        if (stride[0] != 1 ||
-            stride[1] <  0 ||
-            stride[2] <  0 ||
-            stride[3] <  0) {
+    Array<T> out = Array<T>(parent, dims, offset, strides); // non-owning view
 
-            out = copyArray(out);
-        }
+    if (!copy) return out; // caller accepts a view regardless of its strides
 
-        return out;
-    }
+    if (strides[0] != 1 ||
+        strides[1] <  0 ||
+        strides[2] <  0 ||
+        strides[3] <  0) { // non-unit stride along dim 0, or any negative stride: return a copy
 
-    template<typename T>
-    void
-    destroyArray(Array<T> *A)
-    {
-        delete A;
+        out = copyArray(out);
     }
 
+    return out;
+}
 
-    template<typename T>
-    void evalArray(const Array<T> &A)
-    {
-        A.eval();
-    }
+template<typename T>
+void destroyArray(Array<T> *A)
+{
+    // Releases an Array object that was created on the heap.
+    delete A;
+}
 
-    template<typename T>
-    void
-    writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
-    {
-        if(!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
-        }
-        memcpy(arr.get() + arr.getOffset(), data, bytes);
+template<typename T>
+void writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
+{   // Overwrites arr's storage with bytes bytes read from host pointer data.
+    if(!arr.isOwner()) {
+        arr = copyArray<T>(arr); // a non-owning array gets its own buffer before being written
     }
+    arr.eval();
+    getQueue().sync(); // eval() only enqueues the kernel; wait so queued work cannot race the memcpy
+    memcpy(arr.get(), data, bytes);
+}
 
-    template<typename T>
-    void
-    writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
-    {
-        if(!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
-        }
-        memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes);
+template<typename T>
+void
+writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
+{   // Overwrites arr's storage with bytes bytes read from data, reinterpreted as const T*.
+    if(!arr.isOwner()) {
+        arr = copyArray<T>(arr); // a non-owning array gets its own buffer before being written
     }
+    memcpy(arr.get(), (const T * const)data, bytes); // NOTE(review): no getQueue().sync() before this write although eval() in this file only enqueues work; confirm it cannot race
+}
 
 #define INSTANTIATE(T)                                                  \
     template       Array<T>  createHostDataArray<T>   (const dim4 &size, const T * const data); \
@@ -275,26 +254,29 @@ namespace cpu
                                                        const std::vector<af_seq> &index, \
                                                        bool copy);      \
     template       void      destroyArray<T>          (Array<T> *A);    \
-    template       void      evalArray<T>             (const Array<T> &A); \
     template       Array<T>  createNodeArray<T>       (const dim4 &size, TNJ::Node_ptr node); \
     template       void Array<T>::eval();                               \
     template       void Array<T>::eval() const;                         \
     template       Array<T>::Array(af::dim4 dims, const T * const in_data, \
                                    bool is_device, bool copy_device);   \
+    template       Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \
+                                   const T * const in_data,             \
+                                   bool is_device);                     \
     template       TNJ::Node_ptr Array<T>::getNode() const;             \
     template       void      writeHostDataArray<T>    (Array<T> &arr, const T * const data, const size_t bytes); \
     template       void      writeDeviceDataArray<T>  (Array<T> &arr, const void * const data, const size_t bytes); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp
index 471a6741ea..2a3afcf617 100644
--- a/src/backend/cpu/Array.hpp
+++ b/src/backend/cpu/Array.hpp
@@ -20,6 +20,19 @@
 #include <memory>
 #include <algorithm>
 #include <vector>
+#include <platform.hpp>
+#include <queue.hpp>
+
+// cpu::Array class forward declaration
+namespace cpu
+{
+template<typename T> class Array;
+// kernel::evalArray fn forward declaration
+namespace kernel
+{
+template<typename T> void evalArray(cpu::Array<T> in);
+}
+}
 
 namespace cpu
 {
@@ -63,9 +76,6 @@ namespace cpu
                             const std::vector<af_seq> &index,
                             bool copy=true);
 
-    template<typename T>
-    void evalArray(const Array<T> &A);
-
     // Creates a new Array object on the heap and returns a reference to it.
     template<typename T>
     void destroyArray(Array<T> *A);
@@ -74,10 +84,16 @@ namespace cpu
     void *getDevicePtr(const Array<T>& arr)
     {
         T *ptr = arr.device();
-        memPop(ptr);
+        memLock(ptr);
         return (void *)ptr;
     }
 
+    template<typename T>
+    void *getRawPtr(const Array<T>& arr)
+    {
+        return (void *)(arr.get(false)); // get(false): buffer base without the array's offset; get() evaluates a pending array first
+    }
+
     // Array Array Implementation
     template<typename T>
     class Array
@@ -90,18 +106,22 @@ namespace cpu
         af::dim4 data_dims;
         TNJ::Node_ptr node;
 
-        dim_t offset;
         bool ready;
         bool owner;
 
         Array() = default;
         Array(dim4 dims);
+
         explicit Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device=false);
-        Array(const Array<T>& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride);
+        Array(const Array<T>& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride);
         explicit Array(af::dim4 dims, TNJ::Node_ptr n);
 
     public:
 
+
+        Array(af::dim4 dims, af::dim4 strides, dim_t offset,
+              const T * const in_data, bool is_device = false);
+
         void resetInfo(const af::dim4& dims)        { info.resetInfo(dims);         }
         void resetDims(const af::dim4& dims)        { info.resetDims(dims);         }
         void modDims(const af::dim4 &newDims)       { info.modDims(newDims);        }
@@ -112,7 +132,6 @@ namespace cpu
     RET_TYPE NAME() const { return info.NAME(); }
 
         INFO_FUNC(const af_dtype& ,getType)
-        INFO_FUNC(const af::dim4& ,offsets)
         INFO_FUNC(const af::dim4& ,strides)
         INFO_FUNC(size_t          ,elements)
         INFO_FUNC(size_t          ,ndims)
@@ -150,7 +169,7 @@ namespace cpu
         void eval();
         void eval() const;
 
-        dim_t getOffset() const { return offset; }
+        dim_t getOffset() const { return info.getOffset(); }
         shared_ptr<T> getData() const {return data; }
 
         dim4 getDataDims() const
@@ -160,8 +179,14 @@ namespace cpu
             return isOwner() ? info.dims() : data_dims;
         }
 
+        void setDataDims(const dim4 &new_dims)
+        {
+            data_dims = new_dims; // getDataDims() returns this value only for non-owner arrays
+        }
+
         T* device()
         {
+            getQueue().sync();
             if (!isOwner() || data.use_count() > 1) {
                 *this = Array<T>(dims(), get(), true, true);
             }
@@ -181,7 +206,7 @@ namespace cpu
         const T* get(bool withOffset = true) const
         {
             if (!isReady()) eval();
-            return data.get() + (withOffset ? offset : 0);
+            return data.get() + (withOffset ? getOffset() : 0);
         }
 
         int useCount() const
@@ -204,9 +229,11 @@ namespace cpu
                                           const std::vector<af_seq> &index,
                                           bool copy);
 
+        friend void kernel::evalArray<T>(Array<T> in);
+
         friend void destroyArray<T>(Array<T> *arr);
-        friend void evalArray<T>(const Array<T> &arr);
         friend void *getDevicePtr<T>(const Array<T>& arr);
+        friend void *getRawPtr<T>(const Array<T>& arr);
     };
 
 }
diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt
index 6ab66245a0..9387323592 100644
--- a/src/backend/cpu/CMakeLists.txt
+++ b/src/backend/cpu/CMakeLists.txt
@@ -3,14 +3,25 @@ ADD_DEFINITIONS(-DAF_CPU)
 
 FIND_PACKAGE(CBLAS REQUIRED)
 
+OPTION(BUILD_CPU_ASYNC "Build CPU backend with ASYNC support" ON) # turning it OFF defines AF_DISABLE_CPU_ASYNC
+
+IF(NOT BUILD_CPU_ASYNC) # test the variable by name: an empty value must not expand to "IF(NOT )"
+    ADD_DEFINITIONS(-DAF_DISABLE_CPU_ASYNC)
+ENDIF()
+
 IF(USE_CPU_F77_BLAS)
     MESSAGE("Using F77 BLAS")
     ADD_DEFINITIONS(-DUSE_F77_BLAS)
 ENDIF()
 
-IF(USE_CPU_MKL)
-    MESSAGE("Using MKL")
+IF(USE_CPU_MKL) # Manual MKL Setup
+    MESSAGE("CPU Backend Using MKL")
     ADD_DEFINITIONS(-DUSE_MKL)
+ELSE(USE_CPU_MKL) # not requested explicitly: fall back to what the BLAS lookup detected
+    IF(MKL_FOUND) # Automatic MKL Setup from BLAS; tested by name so an undefined value is simply false
+        MESSAGE("CPU Backend Using MKL RT")
+        ADD_DEFINITIONS(-DUSE_MKL)
+    ENDIF()
 ENDIF()
 
 IF (NOT CBLAS_LIBRARIES)
@@ -23,16 +34,20 @@ IF(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" AND "${APPLE}")
     ADD_DEFINITIONS(-flax-vector-conversions)
 ENDIF()
 
-IF(${MKL_FOUND})
-    ADD_DEFINITIONS(-DUSE_MKL)
-ENDIF()
-
 FIND_PACKAGE(FFTW REQUIRED)
 MESSAGE(STATUS "FFTW Found ? ${FFTW_FOUND}")
 MESSAGE(STATUS "FFTW Library: ${FFTW_LIBRARIES}")
 
 IF(APPLE)
-    FIND_PACKAGE(LAPACK)
+    FIND_PACKAGE(LAPACKE QUIET) # Try LAPACKE first so that an MKL installation can be picked up
+    IF(NOT LAPACK_FOUND) # presumably set by the LAPACKE find module; confirm
+        # Clear the cache entries left behind by the failed LAPACKE lookup before trying plain LAPACK
+        UNSET(LAPACKE_LIB CACHE)
+        UNSET(LAPACK_LIB CACHE)
+        UNSET(LAPACKE_INCLUDES CACHE)
+        UNSET(LAPACKE_ROOT_DIR CACHE)
+        FIND_PACKAGE(LAPACK)
+    ENDIF()
 ELSE(APPLE) # Linux and Windows
     FIND_PACKAGE(LAPACKE)
 ENDIF(APPLE)
@@ -41,15 +56,35 @@ IF(NOT LAPACK_FOUND)
     MESSAGE(WARNING "LAPACK not found. Functionality will be disabled")
 ELSE(NOT LAPACK_FOUND)
     ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA)
+    MESSAGE(STATUS "LAPACK libraries found: ${LAPACK_LIBRARIES}")
 ENDIF()
 
 IF(NOT UNIX)
     ADD_DEFINITIONS(-DAFDLL)
 ENDIF()
 
+# The "threads" git submodule directory is on the include path below; its
+# LICENSE file serves as the marker that the checkout is populated.
+SET(THREADS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/threads")
+IF(NOT EXISTS "${THREADS_SRC_DIR}/LICENSE")
+    MESSAGE(STATUS "threads submodule unavailable. Updating submodules.")
+    EXECUTE_PROCESS(
+        COMMAND git submodule update --init --recursive
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+        RESULT_VARIABLE THREADS_SUBMODULE_RESULT
+        )
+    # Report a failed checkout at configure time instead of leaving it silent.
+    IF(NOT EXISTS "${THREADS_SRC_DIR}/LICENSE")
+        MESSAGE(WARNING "threads submodule is still missing "
+                "(git result: ${THREADS_SUBMODULE_RESULT}). Run "
+                "'git submodule update --init --recursive' manually.")
+    ENDIF()
+ENDIF()
+
 INCLUDE_DIRECTORIES(
     ${CMAKE_INCLUDE_PATH}
     "${CMAKE_SOURCE_DIR}/src/backend/cpu"
+    "${CMAKE_SOURCE_DIR}/src/backend/cpu/threads"
     ${FFTW_INCLUDES}
     ${CBLAS_INCLUDE_DIR}
     )
diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp
index 2d3beae942..b817b840b4 100644
--- a/src/backend/cpu/approx.cpp
+++ b/src/backend/cpu/approx.cpp
@@ -9,329 +9,70 @@
 
 #include <Array.hpp>
 #include <approx.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <kernel/approx1.hpp>
+#include <kernel/approx2.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Approx1
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename Ty, typename Tp, af_interp_type method>
-    struct approx1_op
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            return;
-        }
-    };
 
-    template<typename Ty, typename Tp>
-    struct approx1_op<Ty, Tp, AF_INTERP_NEAREST>
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idx;
-            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
-            const Tp x = pos[pmId];
-            bool gFlag = false;
-            if (x < 0 || idims[0] < x+1) {  // No need to check y
-                gFlag = true;
-            }
-
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                           + idy * istrides[1];
-                const dim_t iMem = round(x) + ioff;
-
-                out[omId] = in[iMem];
-            }
-        }
-    };
-
-    template<typename Ty, typename Tp>
-    struct approx1_op<Ty, Tp, AF_INTERP_LINEAR>
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idx;
-            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
-            const Tp x = pos[pmId];
-            bool gFlag = false;
-            if (x < 0 || idims[0] < x+1) {
-                gFlag = true;
-            }
-
-            const dim_t grid_x = floor(x);  // nearest grid
-            const Tp off_x = x - grid_x; // fractional offset
-
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
-
-                // Check if x and x + 1 are both valid indices
-                bool cond = (x < idims[0] - 1);
-                // Compute Left and Right Weighted Values
-                Ty yl = ((Tp)1.0 - off_x) * in[ioff];
-                Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
-                Ty yo = yl + yr;
-                // Compute Weight used
-                Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
-                // Write final value
-                out[omId] = (yo / wt);
-            }
-        }
-    };
-
-    template<typename Ty, typename Tp, af_interp_type method>
-    void approx1_(Ty *out, const af::dim4 &odims, const dim_t oElems,
-            const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-            const Tp *pos, const af::dim4 &pdims,
-            const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-            const float offGrid)
-    {
-        approx1_op<Ty, Tp, method> op;
-        bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
-
-        for(dim_t w = 0; w < odims[3]; w++) {
-            for(dim_t z = 0; z < odims[2]; z++) {
-                for(dim_t y = 0; y < odims[1]; y++) {
-                    for(dim_t x = 0; x < odims[0]; x++) {
-                        op(out, odims, oElems, in, idims, iElems, pos, pdims,
-                           ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
-                    }
-                }
-            }
-        }
-    }
-
-    template<typename Ty, typename Tp>
-    Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
-                       const af_interp_type method, const float offGrid)
-    {
-        af::dim4 odims = in.dims();
-        odims[0] = pos.dims()[0];
-
-        // Create output placeholder
-        Array<Ty> out = createEmptyArray<Ty>(odims);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                approx1_<Ty, Tp, AF_INTERP_NEAREST>
-                        (out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
-                         out.strides(), in.strides(), pos.strides(), offGrid);
-                break;
-            case AF_INTERP_LINEAR:
-                approx1_<Ty, Tp, AF_INTERP_LINEAR>
-                        (out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
-                         out.strides(), in.strides(), pos.strides(), offGrid);
-                break;
-            default:
-                break;
-        }
-        return out;
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Approx2
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename Ty, typename Tp, af_interp_type method>
-    struct approx2_op
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            return;
-        }
-    };
-
-    template<typename Ty, typename Tp>
-    struct approx2_op<Ty, Tp, AF_INTERP_NEAREST>
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idy * pstrides[1] + idx;
-            dim_t qmId = idy * qstrides[1] + idx;
-            if(pBatch) {
-                pmId += idw * pstrides[3] + idz * pstrides[2];
-                qmId += idw * qstrides[3] + idz * qstrides[2];
-            }
-
-            bool gFlag = false;
-            const Tp x = pos[pmId], y = qos[qmId];
-            if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
-                gFlag = true;
-            }
-
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
-                const dim_t imId = idw * istrides[3] + idz * istrides[2] +
-                                grid_y * istrides[1] + grid_x;
-                out[omId] = in[imId];
-            }
-        }
-    };
-
-    template<typename Ty, typename Tp>
-    struct approx2_op<Ty, Tp, AF_INTERP_LINEAR>
-    {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idy * pstrides[1] + idx;
-            dim_t qmId = idy * qstrides[1] + idx;
-            if(pBatch) {
-                pmId += idw * pstrides[3] + idz * pstrides[2];
-                qmId += idw * qstrides[3] + idz * qstrides[2];
-            }
-
-            bool gFlag = false;
-            const Tp x = pos[pmId], y = qos[qmId];
-            if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
-                gFlag = true;
-            }
-
-            const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
-            const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
-
-            // Check if pVal and pVal + 1 are both valid indices
-            bool condY = (y < idims[1] - 1);
-            bool condX = (x < idims[0] - 1);
-
-            // Compute wieghts used
-            Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y);
-            Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0;
-            Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0;
-            Tp wt11 = (condX && condY) ? (off_x) * (off_y)  : 0;
-
-            Tp wt = wt00 + wt10 + wt01 + wt11;
-            Ty zero = scalar<Ty>(0);
-
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                        + grid_y * istrides[1] + grid_x;
-
-                // Compute Weighted Values
-                Ty y00 =                    wt00 * in[ioff];
-                Ty y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
-                Ty y01 = (condX) ?          wt01 * in[ioff + 1]               : zero;
-                Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
-
-                Ty yo = y00 + y10 + y01 + y11;
-
-                // Write Final Value
-                out[omId] = (yo / wt);
-            }
-        }
-    };
-
-    template<typename Ty, typename Tp, af_interp_type method>
-    void approx2_(Ty *out, const af::dim4 &odims, const dim_t oElems,
-            const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-            const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-            const af::dim4 &ostrides, const af::dim4 &istrides,
-            const af::dim4 &pstrides, const af::dim4 &qstrides,
-            const float offGrid)
-    {
-        approx2_op<Ty, Tp, method> op;
-        bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
-
-        for(dim_t w = 0; w < odims[3]; w++) {
-            for(dim_t z = 0; z < odims[2]; z++) {
-                for(dim_t y = 0; y < odims[1]; y++) {
-                    for(dim_t x = 0; x < odims[0]; x++) {
-                        op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
-                           ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
-                    }
-                }
-            }
-        }
+// Allocates the result of a 1D approx call and queues the kernel matching `method`.
+// The result takes its dims from `in`, with dim 0 replaced by pos.dims()[0];
+// offGrid is forwarded to the kernel unchanged.
+template<typename Ty, typename Tp>
+Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
+                  const af_interp_type method, const float offGrid)
+{
+    // eval() both operands before anything is queued.
+    in.eval();
+    pos.eval();
+
+    // Same dims as the input, except along dim 0.
+    af::dim4 resultDims = in.dims();
+    resultDims[0] = pos.dims()[0];
+    Array<Ty> out = createEmptyArray<Ty>(resultDims);
+
+    // Only NEAREST and LINEAR queue a kernel; any other method returns `out` as allocated.
+    if (method == AF_INTERP_NEAREST) {
+        getQueue().enqueue(kernel::approx1<Ty, Tp, AF_INTERP_NEAREST>,
+                           out, in, pos, offGrid);
+    } else if (method == AF_INTERP_LINEAR) {
+        getQueue().enqueue(kernel::approx1<Ty, Tp, AF_INTERP_LINEAR>,
+                           out, in, pos, offGrid);
     }
+    return out;
+}
 
-    template<typename Ty, typename Tp>
-    Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
-                       const af_interp_type method, const float offGrid)
-    {
-        af::dim4 odims = in.dims();
-        odims[0] = pos0.dims()[0];
-        odims[1] = pos0.dims()[1];
-
-        // Create output placeholder
-        Array<Ty> out = createEmptyArray<Ty>(odims);
 
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                approx2_<Ty, Tp, AF_INTERP_NEAREST>
-                        (out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(),
-                         pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
-                         out.strides(), in.strides(), pos0.strides(), pos1.strides(),
-                         offGrid);
-                break;
-            case AF_INTERP_LINEAR:
-                approx2_<Ty, Tp, AF_INTERP_LINEAR>
-                        (out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(),
-                         pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
-                         out.strides(), in.strides(), pos0.strides(), pos1.strides(),
-                         offGrid);
-                break;
-            default:
-                break;
-        }
-        return out;
+// Allocates the result of a 2D approx call and queues the kernel matching `method`.
+// The result takes its dims from `in`, with dims 0 and 1 replaced by those of pos0;
+// offGrid is forwarded to the kernel unchanged.
+template<typename Ty, typename Tp>
+Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
+                  const af_interp_type method, const float offGrid)
+{
+    // eval() all three operands before anything is queued.
+    in.eval();
+    pos0.eval();
+    pos1.eval();
+
+    // Same dims as the input, except along dims 0 and 1.
+    af::dim4 resultDims = in.dims();
+    resultDims[0] = pos0.dims()[0];
+    resultDims[1] = pos0.dims()[1];
+    Array<Ty> out = createEmptyArray<Ty>(resultDims);
+
+    // Only NEAREST and LINEAR queue a kernel; any other method returns `out` as allocated.
+    if (method == AF_INTERP_NEAREST) {
+        getQueue().enqueue(kernel::approx2<Ty, Tp, AF_INTERP_NEAREST>,
+                           out, in, pos0, pos1, offGrid);
+    } else if (method == AF_INTERP_LINEAR) {
+        getQueue().enqueue(kernel::approx2<Ty, Tp, AF_INTERP_LINEAR>,
+                           out, in, pos0, pos1, offGrid);
     }
+    return out;
+}
 
 #define INSTANTIATE(Ty, Tp)                                                                    \
     template Array<Ty> approx1<Ty, Tp>(const Array<Ty> &in, const Array<Tp> &pos,              \
@@ -340,8 +81,9 @@ namespace cpu
                                        const Array<Tp> &pos1, const af_interp_type method,     \
                                        const float offGrid);                                   \
 
-    INSTANTIATE(float  , float )
-    INSTANTIATE(double , double)
-    INSTANTIATE(cfloat , float )
-    INSTANTIATE(cdouble, double)
+INSTANTIATE(float  , float )
+INSTANTIATE(double , double)
+INSTANTIATE(cfloat , float )
+INSTANTIATE(cdouble, double)
+
 }
diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp
index 623bd52ac7..463b30c733 100644
--- a/src/backend/cpu/assign.cpp
+++ b/src/backend/cpu/assign.cpp
@@ -12,34 +12,26 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <handle.hpp>
+#include <kernel/assign.hpp>
 #include <assign.hpp>
-#include <err_cpu.hpp>
-
-using af::dim4;
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
 
-static inline
-dim_t trimIndex(int idx, const dim_t &len)
-{
-    int ret_val = idx;
-    int offset  = abs(ret_val)%len;
-    if (ret_val<0) {
-        ret_val = offset-1;
-    } else if (ret_val>=(int)len) {
-        ret_val = len-offset-1;
-    }
-    return ret_val;
-}
+using af::dim4;
+using std::vector;
 
 template<typename T>
 void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
 {
-    bool isSeq[4];
-    std::vector<af_seq> seqs(4, af_span);
-    // create seq vector to retrieve output
-    // dimensions, offsets & offsets
+    out.eval();
+    rhs.eval();
+
+    vector<bool> isSeq(4);
+    vector<af_seq> seqs(4, af_span);
+    // create seq vector to retrieve output dimensions, offsets & strides
     for (dim_t x=0; x<4; ++x) {
         if (idxrs[x].isSeq) {
             seqs[x] = idxrs[x].idx.seq;
@@ -47,68 +39,17 @@ void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
         isSeq[x] = idxrs[x].isSeq;
     }
 
-    dim4 dDims = out.getDataDims();
-    dim4 pDims = out.dims();
-    // retrieve dimensions & strides for array
-    // to which rhs is being copied to
-    dim4 dst_offsets    = toOffset(seqs, dDims);
-    dim4 dst_strides    = toStride(seqs, dDims);
-    // retrieve rhs array dimenesions & strides
-    dim4 src_dims       = rhs.dims();
-    dim4 src_strides    = rhs.strides();
-
-    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
+    vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
     // look through indexs to read af_array indexs
     for (dim_t x=0; x<4; ++x) {
         if (!isSeq[x]) {
             idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
+            idxArrs[x].eval();
         }
     }
 
-    // declare pointers to af_array index data
-    const uint* ptr0 = idxArrs[0].get();
-    const uint* ptr1 = idxArrs[1].get();
-    const uint* ptr2 = idxArrs[2].get();
-    const uint* ptr3 = idxArrs[3].get();
-
-    const T * src= rhs.get();
-    T * dst      = out.get();
-
-    for(dim_t l=0; l<src_dims[3]; ++l) {
-
-        dim_t src_loff = l*src_strides[3];
-
-        dim_t dst_lIdx = trimIndex(isSeq[3] ? l+dst_offsets[3] : ptr3[l], pDims[3]);
-        dim_t dst_loff = dst_lIdx * dst_strides[3];
-
-        for(dim_t k=0; k<src_dims[2]; ++k) {
-
-            dim_t src_koff = k*src_strides[2];
-
-            dim_t dst_kIdx = trimIndex(isSeq[2] ? k+dst_offsets[2] : ptr2[k], pDims[2]);
-            dim_t dst_koff = dst_kIdx * dst_strides[2];
-
-            for(dim_t j=0; j<src_dims[1]; ++j) {
-
-                dim_t src_joff = j*src_strides[1];
-
-                dim_t dst_jIdx = trimIndex(isSeq[1] ? j+dst_offsets[1] : ptr1[j], pDims[1]);
-                dim_t dst_joff = dst_jIdx * dst_strides[1];
-
-                for(dim_t i=0; i<src_dims[0]; ++i) {
-
-                    dim_t src_ioff = i*src_strides[0];
-                    dim_t src_idx  = src_ioff + src_joff + src_koff + src_loff;
-
-                    dim_t dst_iIdx = trimIndex(isSeq[0] ? i+dst_offsets[0] : ptr0[i], pDims[0]);
-                    dim_t dst_ioff = dst_iIdx * dst_strides[0];
-                    dim_t dst_idx  = dst_ioff + dst_joff + dst_koff + dst_loff;
-
-                    dst[dst_idx] = src[src_idx];
-                }
-            }
-        }
-    }
+    getQueue().enqueue(kernel::assign<T>, out, rhs, std::move(isSeq),
+            std::move(seqs), std::move(idxArrs));
 }
 
 #define INSTANTIATE(T) \
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp
index 2d1e4dddff..abd985768d 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/bilateral.cpp
@@ -11,89 +11,25 @@
 #include <af/defines.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
+#include <kernel/bilateral.hpp>
 #include <bilateral.hpp>
 #include <cmath>
 #include <algorithm>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-static inline dim_t clamp(int a, dim_t mn, dim_t mx)
-{
-    return (a < (int)mn ? mn : (a > (int)mx ? mx : a));
-}
-
-static inline unsigned getIdx(const dim4 &strides,
-        int i, int j = 0, int k = 0, int l = 0)
-{
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i * strides[0]);
-}
-
 template<typename inType, typename outType, bool isColor>
 Array<outType> bilateral(const Array<inType> &in, const float &s_sigma, const float &c_sigma)
 {
+    in.eval();
     const dim4 dims     = in.dims();
-    const dim4 istrides = in.strides();
-
     Array<outType> out = createEmptyArray<outType>(dims);
-    const dim4 ostrides = out.strides();
-
-    outType *outData    = out.get();
-    const inType * inData = in.get();
-
-    // clamp spatical and chromatic sigma's
-    float space_          = std::min(11.5f, std::max(s_sigma, 0.f));
-    float color_          = std::max(c_sigma, 0.f);
-    const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1);
-    const float svar      = space_*space_;
-    const float cvar      = color_*color_;
-
-    for(dim_t b3=0; b3<dims[3]; ++b3) {
-        // b3 for loop handles following batch configurations
-        //  - gfor
-        //  - input based batch
-        //      - when input is 4d array for color images
-        for(dim_t b2=0; b2<dims[2]; ++b2) {
-            // b2 for loop handles following batch configurations
-            //  - channels
-            //  - input based batch
-            //      - when input is 3d array for grayscale images
-            for(dim_t j=0; j<dims[1]; ++j) {
-                // j steps along 2nd dimension
-                for(dim_t i=0; i<dims[0]; ++i) {
-                    // i steps along 1st dimension
-                    outType norm = 0.0;
-                    outType res  = 0.0;
-                    const outType center = (outType)inData[getIdx(istrides, i, j)];
-                    for(dim_t wj=-radius; wj<=radius; ++wj) {
-                        // clamps offsets
-                        dim_t tj = clamp(j+wj, 0, dims[1]-1);
-                        for(dim_t wi=-radius; wi<=radius; ++wi) {
-                            // clamps offsets
-                            dim_t ti = clamp(i+wi, 0, dims[0]-1);
-                            // proceed
-                            const outType val= (outType)inData[getIdx(istrides, ti, tj)];
-                            const outType gauss_space = (wi*wi+wj*wj)/(-2.0*svar);
-                            const outType gauss_range = ((center-val)*(center-val))/(-2.0*cvar);
-                            const outType weight = std::exp(gauss_space+gauss_range);
-                            norm += weight;
-                            res += val*weight;
-                        }
-                    } // filter loop ends here
-
-                    outData[getIdx(ostrides, i, j)] = res/norm;
-                } //1st dimension loop ends here
-            } //2nd dimension loop ends here
-            outData += ostrides[2];
-            inData  += istrides[2];
-        }
-    }
-
+    getQueue().enqueue(kernel::bilateral<outType, inType, isColor>, out, in, s_sigma, c_sigma);
     return out;
 }
 
diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp
index d01998bafb..3ecb502ffa 100644
--- a/src/backend/cpu/blas.cpp
+++ b/src/backend/cpu/blas.cpp
@@ -11,18 +11,20 @@
 #include <af/dim4.hpp>
 #include <handle.hpp>
 #include <cassert>
-#include <err_cpu.hpp>
 #include <err_common.hpp>
+#include <kernel/dot.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
 
-    using std::add_const;
-    using std::add_pointer;
-    using std::enable_if;
-    using std::is_floating_point;
-    using std::remove_const;
-    using std::conditional;
+using std::add_const;
+using std::add_pointer;
+using std::enable_if;
+using std::is_floating_point;
+using std::remove_const;
+using std::conditional;
 
 // Some implementations of BLAS require void* for complex pointers while others use float*/double*
 //
@@ -145,6 +147,9 @@ template<typename T>
 Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                 af_mat_prop optLhs, af_mat_prop optRhs)
 {
+    lhs.eval();
+    rhs.eval();
+
     CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs);
     CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs);
 
@@ -158,77 +163,60 @@ Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
     int N = rDims[bColDim];
     int K = lDims[aColDim];
 
-    //FIXME: Leaks on errors.
-    Array<T> out = createEmptyArray<T>(af::dim4(M, N, 1, 1));
-    auto alpha = getScale<T, 1>();
-    auto beta  = getScale<T, 0>();
-
-    dim4 lStrides = lhs.strides();
-    dim4 rStrides = rhs.strides();
     using BT  =       typename blas_base<T>::type;
     using CBT = const typename blas_base<T>::type;
 
-    if(rDims[bColDim] == 1) {
-        N = lDims[aColDim];
-        gemv_func<T>()(
-            CblasColMajor, lOpts,
-            lDims[0], lDims[1],
-            alpha,
-            reinterpret_cast<CBT*>(lhs.get()), lStrides[1],
-            reinterpret_cast<CBT*>(rhs.get()), rStrides[0],
-            beta,
-            reinterpret_cast<BT*>(out.get()), 1);
-    } else {
-        gemm_func<T>()(
-            CblasColMajor, lOpts, rOpts,
-            M, N, K,
-            alpha,
-            reinterpret_cast<CBT*>(lhs.get()), lStrides[1],
-            reinterpret_cast<CBT*>(rhs.get()), rStrides[1],
-            beta,
-            reinterpret_cast<BT*>(out.get()), out.dims()[0]);
-    }
+    Array<T> out = createEmptyArray<T>(af::dim4(M, N, 1, 1));
+    auto func = [=] (Array<T> output, const Array<T> left, const Array<T> right) {
+        auto alpha = getScale<T, 1>();
+        auto beta  = getScale<T, 0>();
+
+        dim4 lStrides = left.strides();
+        dim4 rStrides = right.strides();
+
+        if(rDims[bColDim] == 1) {
+            gemv_func<T>()(
+                CblasColMajor, lOpts,
+                lDims[0], lDims[1],
+                alpha,
+                reinterpret_cast<CBT*>(left.get()), lStrides[1],
+                reinterpret_cast<CBT*>(right.get()), rStrides[0],
+                beta,
+                reinterpret_cast<BT*>(output.get()), 1);
+        } else {
+            gemm_func<T>()(
+                CblasColMajor, lOpts, rOpts,
+                M, N, K,
+                alpha,
+                reinterpret_cast<CBT*>(left.get()), lStrides[1],
+                reinterpret_cast<CBT*>(right.get()), rStrides[1],
+                beta,
+                reinterpret_cast<BT*>(output.get()), output.dims()[0]);
+        }
+    };
+    getQueue().enqueue(func, out, lhs, rhs);
 
     return out;
 }
 
-template<typename T> T
-conj(T  x) { return x; }
-
-template<> cfloat  conj<cfloat> (cfloat  c) { return std::conj(c); }
-template<> cdouble conj<cdouble>(cdouble c) { return std::conj(c); }
-
-template<typename T, bool conjugate, bool both_conjugate>
-Array<T> dot_(const Array<T> &lhs, const Array<T> &rhs,
-              af_mat_prop optLhs, af_mat_prop optRhs)
-{
-    int N = lhs.dims()[0];
-
-    T out = 0;
-    const T *pL = lhs.get();
-    const T *pR = rhs.get();
-
-    for(int i = 0; i < N; i++)
-        out += (conjugate ? cpu::conj(pL[i]) : pL[i]) * pR[i];
-
-    if(both_conjugate) out = cpu::conj(out);
-
-    return createValueArray(af::dim4(1), out);
-}
-
 template<typename T>
 Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
              af_mat_prop optLhs, af_mat_prop optRhs)
 {
+    lhs.eval();
+    rhs.eval();
+
+    Array<T> out = createEmptyArray<T>(af::dim4(1));
     if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) {
-        return dot_<T, false, true>(lhs, rhs, optLhs, optRhs);
+        getQueue().enqueue(kernel::dot<T, false, true>, out, lhs, rhs, optLhs, optRhs);
     } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) {
-        return dot_<T, true, false>(lhs, rhs, optLhs, optRhs);
+        getQueue().enqueue(kernel::dot<T, true, false>,out, lhs, rhs, optLhs, optRhs);
     } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) {
-        return dot_<T, true, false>(rhs, lhs, optRhs, optLhs);
+        getQueue().enqueue(kernel::dot<T, true, false>,out, rhs, lhs, optRhs, optLhs);
     } else {
-        return dot_<T, false, false>(lhs, rhs, optLhs, optRhs);
+        getQueue().enqueue(kernel::dot<T, false, false>,out, lhs, rhs, optLhs, optRhs);
     }
+    return out;
 }
 
 #undef BT
diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp
index 117d3a2145..3f5b7451ad 100644
--- a/src/backend/cpu/blas.hpp
+++ b/src/backend/cpu/blas.hpp
@@ -10,17 +10,18 @@
 #include <af/defines.h>
 #include <af/blas.h>
 #include <Array.hpp>
+#include <types.hpp>
 
-#ifdef __APPLE__
-#include <Accelerate/Accelerate.h>
-#else
 #ifdef USE_MKL
-#include <mkl_cblas.h>
+    #include <mkl_cblas.h>
 #else
-extern "C" {
-#include <cblas.h>
-}
-#endif
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+    #else
+        extern "C" {
+            #include <cblas.h>
+        }
+    #endif
 #endif
 
 // TODO: Ask upstream for a more official way to detect it
diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp
index 57beaa4146..5e393f0082 100644
--- a/src/backend/cpu/cholesky.cpp
+++ b/src/backend/cpu/cholesky.cpp
@@ -18,8 +18,9 @@
 #include <cassert>
 #include <err_cpu.hpp>
 #include <triangle.hpp>
-
 #include <lapack_helper.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
@@ -46,6 +47,8 @@ CH_FUNC(potrf , cdouble, z)
 template<typename T>
 Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
 {
+    in.eval();
+
     Array<T> out = copyArray<T>(in);
     *info = cholesky_inplace(out, is_upper);
 
@@ -58,6 +61,8 @@ Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
 template<typename T>
 int cholesky_inplace(Array<T> &in, const bool is_upper)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     int N = iDims[0];
 
@@ -65,8 +70,13 @@ int cholesky_inplace(Array<T> &in, const bool is_upper)
     if(is_upper)
         uplo = 'U';
 
-    int info = potrf_func<T>()(AF_LAPACK_COL_MAJOR, uplo,
-                               N, in.get(), in.strides()[1]);
+    int info = 0;
+    // Return the LAPACK status through a pointer: an int& handed to enqueue() may end up bound to a copy (NOTE(review): confirm against the queue implementation).
+    auto func = [&] (int *status, Array<T>& in) {
+        *status = potrf_func<T>()(AF_LAPACK_COL_MAJOR, uplo, N, in.get(), in.strides()[1]);
+    };
+    getQueue().enqueue(func, &info, in);
+    getQueue().sync(); // wait for func: it writes info and reads uplo/N, all locals of this call
 
     return info;
 }
diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp
index 77d7daa5cd..8218a3f9a3 100644
--- a/src/backend/cpu/convolve.cpp
+++ b/src/backend/cpu/convolve.cpp
@@ -14,176 +14,23 @@
 #include <convolve.hpp>
 #include <err_cpu.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/convolve.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename T, typename accT, bool expand>
-void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
-                dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides)
-{
-    dim_t start = (expand ? 0 : fDims[0]/2);
-    dim_t end   = (expand ? oDims[0] : start + sDims[0]);
-    for(dim_t i=start; i<end; ++i) {
-        accT accum = 0.0;
-        for(dim_t f=0; f<fDims[0]; ++f) {
-            dim_t iIdx = i-f;
-            T s_val = ((iIdx>=0 &&iIdx<sDims[0])? iptr[iIdx*sStrides[0]] : T(0));
-            accum += accT(s_val * fptr[f]);
-        }
-        optr[i-start] = T(accum);
-    }
-}
-
-template<typename T, typename accT, bool expand>
-void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
-                dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides,
-                dim4 const &sStrides, dim4 const &fStrides)
-{
-    dim_t jStart = (expand ? 0 : fDims[1]/2);
-    dim_t jEnd   = (expand ? oDims[1] : jStart + sDims[1]);
-    dim_t iStart = (expand ? 0 : fDims[0]/2);
-    dim_t iEnd   = (expand ? oDims[0] : iStart + sDims[0]);
-
-    for(dim_t j=jStart; j<jEnd; ++j) {
-        dim_t joff = (j-jStart)*oStrides[1];
-
-        for(dim_t i=iStart; i<iEnd; ++i) {
-
-            accT accum = accT(0);
-            for(dim_t wj=0; wj<fDims[1]; ++wj) {
-                dim_t jIdx  = j-wj;
-                dim_t w_joff = wj*fStrides[1];
-                dim_t s_joff = jIdx * sStrides[1];
-                bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
-
-                for(dim_t wi=0; wi<fDims[0]; ++wi) {
-                    dim_t iIdx = i-wi;
-
-                    T s_val = T(0);
-                    if ( isJValid && (iIdx>=0 && iIdx<sDims[0])) {
-                        s_val = iptr[s_joff+iIdx*sStrides[0]];
-                    }
-
-                    accum += accT(s_val * fptr[w_joff+wi*fStrides[0]]);
-                }
-            }
-            optr[joff+i-iStart] = T(accum);
-        }
-    }
-}
-
-template<typename T, typename accT, bool expand>
-void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
-                dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides,
-                dim4 const &sStrides, dim4 const &fStrides)
-{
-    dim_t kStart = (expand ? 0 : fDims[2]/2);
-    dim_t kEnd   = (expand ? oDims[2] : kStart + sDims[2]);
-    dim_t jStart = (expand ? 0 : fDims[1]/2);
-    dim_t jEnd   = (expand ? oDims[1] : jStart + sDims[1]);
-    dim_t iStart = (expand ? 0 : fDims[0]/2);
-    dim_t iEnd   = (expand ? oDims[0] : iStart + sDims[0]);
-
-    for(dim_t k=kStart; k<kEnd; ++k) {
-        dim_t koff = (k-kStart)*oStrides[2];
-
-        for(dim_t j=jStart; j<jEnd; ++j) {
-            dim_t joff = (j-jStart)*oStrides[1];
-
-            for(dim_t i=iStart; i<iEnd; ++i) {
-
-                accT accum = accT(0);
-                for(dim_t wk=0; wk<fDims[2]; ++wk) {
-                    dim_t kIdx  = k-wk;
-                    dim_t w_koff = wk*fStrides[2];
-                    dim_t s_koff = kIdx * sStrides[2];
-                    bool isKValid = (kIdx>=0 && kIdx<sDims[2]);
-
-                    for(dim_t wj=0; wj<fDims[1]; ++wj) {
-                        dim_t jIdx  = j-wj;
-                        dim_t w_joff = wj*fStrides[1];
-                        dim_t s_joff = jIdx * sStrides[1];
-                        bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
-
-                        for(dim_t wi=0; wi<fDims[0]; ++wi) {
-                            dim_t iIdx = i-wi;
-
-                            T s_val = T(0);
-                            if ( isKValid && isJValid && (iIdx>=0 && iIdx<sDims[0])) {
-                                s_val = iptr[s_koff+s_joff+iIdx*sStrides[0]];
-                            }
-
-                            accum += accT(s_val * fptr[w_koff+w_joff+wi*fStrides[0]]);
-                        }
-                    }
-                }
-                optr[koff+joff+i-iStart] = T(accum);
-            } //i loop ends here
-        } // j loop ends here
-    } // k loop ends here
-}
-
-template<typename T, typename accT, dim_t baseDim, bool expand>
-void convolve_nd(T *optr, T const *iptr, accT const *fptr,
-                dim4 const &oDims, dim4 const &sDims, dim4 const &fDims,
-                dim4 const &oStrides, dim4 const &sStrides, dim4 const &fStrides,
-                ConvolveBatchKind kind)
-{
-    dim_t out_step[4]  = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
-    dim_t in_step[4]   = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
-    dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
-    dim_t batch[4]     = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */
-
-    for (dim_t i=1; i<4; ++i) {
-        switch(kind) {
-            case CONVOLVE_BATCH_SIGNAL:
-                out_step[i] = oStrides[i];
-                in_step[i]  = sStrides[i];
-                if (i>=baseDim) batch[i] = sDims[i];
-                break;
-            case CONVOLVE_BATCH_SAME:
-                out_step[i]  = oStrides[i];
-                in_step[i]   = sStrides[i];
-                filt_step[i] = fStrides[i];
-                if (i>=baseDim) batch[i] = sDims[i];
-                break;
-            case CONVOLVE_BATCH_KERNEL:
-                out_step[i]  = oStrides[i];
-                filt_step[i] = fStrides[i];
-                if (i>=baseDim) batch[i] = fDims[i];
-                break;
-            default:
-                break;
-        }
-    }
-
-    for (dim_t b3=0; b3<batch[3]; ++b3) {
-        for (dim_t b2=0; b2<batch[2]; ++b2) {
-            for (dim_t b1=0; b1<batch[1]; ++b1) {
-
-                T * out          = optr + b1 * out_step[1] + b2 * out_step[2] + b3 * out_step[3];
-                T const *in      = iptr + b1 *  in_step[1] + b2 *  in_step[2] + b3 *  in_step[3];
-                accT const *filt = fptr + b1 *filt_step[1] + b2 *filt_step[2] + b3 *filt_step[3];
-
-                switch(baseDim) {
-                    case 1: one2one_1d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, sStrides);                     break;
-                    case 2: one2one_2d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
-                    case 3: one2one_3d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
-                }
-            }
-        }
-    }
-}
-
 template<typename T, typename accT, dim_t baseDim, bool expand>
 Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind)
 {
+    signal.eval();
+    filter.eval();
+
     auto sDims    = signal.dims();
     auto fDims    = filter.dims();
-    auto sStrides = signal.strides();
 
     dim4 oDims(1);
     if (expand) {
@@ -204,99 +51,37 @@ Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBat
 
     Array<T> out = createEmptyArray<T>(oDims);
 
-    convolve_nd<T, accT, baseDim, expand>(out.get(), signal.get(), filter.get(),
-            oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), kind);
+    getQueue().enqueue(kernel::convolve_nd<T, accT, baseDim, expand>,out, signal, filter, kind);
 
     return out;
 }
 
-template<typename T, typename accT, dim_t conv_dim, bool expand>
-void convolve2_separable(T *optr, T const *iptr, accT const *fptr,
-                        dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim,
-                        dim4 const &oStrides, dim4 const &sStrides, dim_t fStride)
-{
-    for(dim_t j=0; j<oDims[1]; ++j) {
-
-        dim_t jOff = j*oStrides[1];
-        dim_t cj = j + (conv_dim==1)*(expand ? 0: fDim>>1);
-
-        for(dim_t i=0; i<oDims[0]; ++i) {
-
-            dim_t iOff = i*oStrides[0];
-            dim_t ci = i + (conv_dim==0)*(expand ? 0 : fDim>>1);
-
-            accT accum = scalar<accT>(0);
-
-            for(dim_t f=0; f<fDim; ++f) {
-                T f_val = fptr[f];
-                T s_val;
-
-                if (conv_dim==0) {
-                    dim_t offi = ci - f;
-                    bool isCIValid = offi>=0 && offi<sDims[0];
-                    bool isCJValid = cj>=0 && cj<sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[cj*sDims[0]+offi] : scalar<T>(0));
-                } else {
-                    dim_t offj = cj - f;
-                    bool isCIValid = ci>=0 && ci<sDims[0];
-                    bool isCJValid = offj>=0 && offj<sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[offj*sDims[0]+ci] : scalar<T>(0));
-                }
-
-                accum += accT(s_val * f_val);
-            }
-            optr[iOff+jOff] = T(accum);
-        }
-    }
-}
-
 template<typename T, typename accT, bool expand>
 Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter)
 {
-    auto sDims    = signal.dims();
-    auto cfDims   = c_filter.dims();
-    auto rfDims   = r_filter.dims();
-    auto sStrides = signal.strides();
-
-    dim_t cflen = (dim_t)cfDims.elements();
-    dim_t rflen = (dim_t)rfDims.elements();
+    signal.eval();
+    c_filter.eval();
+    r_filter.eval();
 
+    auto sDims = signal.dims();
     dim4 tDims = sDims;
     dim4 oDims = sDims;
 
     if (expand) {
+        auto cfDims   = c_filter.dims();
+        auto rfDims   = r_filter.dims();
+
+        dim_t cflen = (dim_t)cfDims.elements();
+        dim_t rflen = (dim_t)rfDims.elements();
         // separable convolve only does CONVOLVE_BATCH_NONE and standard batch(CONVOLVE_BATCH_SIGNAL)
         tDims[0] += cflen - 1;
         oDims[0] += cflen - 1;
         oDims[1] += rflen - 1;
     }
 
-    Array<T> temp = createEmptyArray<T>(tDims);
     Array<T> out  = createEmptyArray<T>(oDims);
-    auto tStrides = temp.strides();
-    auto oStrides = out.strides();
-
-    for (dim_t b3=0; b3<oDims[3]; ++b3) {
-
-        dim_t i_b3Off = b3*sStrides[3];
-        dim_t t_b3Off = b3*tStrides[3];
-        dim_t o_b3Off = b3*oStrides[3];
-
-        for (dim_t b2=0; b2<oDims[2]; ++b2) {
 
-            T const *iptr = signal.get()+ b2*sStrides[2] + i_b3Off;
-            T *tptr = temp.get() + b2*tStrides[2] + t_b3Off;
-            T *optr = out.get()  + b2*oStrides[2] + o_b3Off;
-
-            convolve2_separable<T, accT, 0, expand>(tptr, iptr, c_filter.get(),
-                    tDims, sDims, sDims, cflen,
-                    tStrides, sStrides, c_filter.strides()[0]);
-
-            convolve2_separable<T, accT, 1, expand>(optr, tptr, r_filter.get(),
-                    oDims, tDims, sDims, rflen,
-                    oStrides, tStrides, r_filter.strides()[0]);
-        }
-    }
+    getQueue().enqueue(kernel::convolve2<T, accT, expand>, out, signal, c_filter, r_filter, tDims);
 
     return out;
 }
diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp
index 87e4480a36..27e80f8afb 100644
--- a/src/backend/cpu/copy.cpp
+++ b/src/backend/cpu/copy.cpp
@@ -18,139 +18,79 @@
 #include <cassert>
 #include <err_cpu.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/copy.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim)
-    {
-        if(dim == 0) {
-            if(strides[dim] == 1) {
-                //FIXME: Check for errors / exceptions
-                memcpy(dst, src, dims[dim] * sizeof(T));
-            } else {
-                for(dim_t i = 0; i < dims[dim]; i++) {
-                    dst[i] = src[strides[dim]*i];
-                }
-            }
-        } else {
-            for(dim_t i = dims[dim]; i > 0; i--) {
-                stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
-                src += strides[dim];
-                dst += ostrides[dim];
-            }
-        }
-    }
-
-    // Assigns to single elements
-    template<typename T>
-    void copyData(T *to, const Array<T> &from)
-    {
-        if(from.isOwner()) {
-            // FIXME: Check for errors / exceptions
-            memcpy(to, from.get(), from.elements()*sizeof(T));
-        } else {
-            dim4 ostrides = calcStrides(from.dims());
-            stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
-        }
-    }
-
-    template<typename T>
-    Array<T> copyArray(const Array<T> &A)
-    {
-        Array<T> out = createEmptyArray<T>(A.dims());
-        copyData(out.get(), A);
-        return out;
-    }
 
-    template<typename inType, typename outType>
-    static void copy(Array<outType> &dst, const Array<inType> &src, outType default_value, double factor)
-    {
-        dim4 src_dims       = src.dims();
-        dim4 dst_dims       = dst.dims();
-        dim4 src_strides    = src.strides();
-        dim4 dst_strides    = dst.strides();
-
-        const inType * src_ptr = src.get();
-        outType * dst_ptr      = dst.get();
-
-        dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
-        dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
-        dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
-        dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
-
-        for(dim_t l=0; l<dst_dims[3]; ++l) {
-
-            dim_t src_loff = l*src_strides[3];
-            dim_t dst_loff = l*dst_strides[3];
-            bool isLvalid = l<trgt_l;
-
-            for(dim_t k=0; k<dst_dims[2]; ++k) {
-
-                dim_t src_koff = k*src_strides[2];
-                dim_t dst_koff = k*dst_strides[2];
-                bool isKvalid = k<trgt_k;
-
-                for(dim_t j=0; j<dst_dims[1]; ++j) {
-
-                    dim_t src_joff = j*src_strides[1];
-                    dim_t dst_joff = j*dst_strides[1];
-                    bool isJvalid = j<trgt_j;
-
-                    for(dim_t i=0; i<dst_dims[0]; ++i) {
-                        outType temp = default_value;
-                        if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
-                            dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
-                            temp = outType(src_ptr[src_idx])*outType(factor);
-                        }
-                        dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
-                        dst_ptr[dst_idx] = temp;
-                    }
-                }
-            }
-        }
+template<typename T>
+void copyData(T *to, const Array<T> &from)
+{
+    from.eval();
+    getQueue().sync();
+    if(from.isLinear()) {
+        // FIXME: Check for errors / exceptions
+        memcpy(to, from.get(), from.elements()*sizeof(T));
+    } else {
+        dim4 ostrides = calcStrides(from.dims());
+        kernel::stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
     }
+}
 
-    template<typename T>
-    void multiply_inplace(Array<T> &in, double val)
-    {
-        copy<T, T>(in, in, 0, val);
-    }
+template<typename T>
+Array<T> copyArray(const Array<T> &A)
+{
+    A.eval();
+    Array<T> out = createEmptyArray<T>(A.dims());
+    getQueue().enqueue(kernel::copy<T, T>, out, A, scalar<T>(0), 1.0);
+    return out;
+}
 
-    template<typename inType, typename outType>
-    Array<outType>
-    padArray(Array<inType> const &in, dim4 const &dims,
-             outType default_value, double factor)
-    {
-        Array<outType> ret = createValueArray<outType>(dims, default_value);
-        copy<inType, outType>(ret, in, outType(default_value), factor);
-        return ret;
-    }
+template<typename T>
+void multiply_inplace(Array<T> &in, double val)
+{
+    in.eval();
+    getQueue().enqueue(kernel::copy<T, T>, in, in, 0, val);
+}
 
-    template<typename inType, typename outType>
-    void copyArray(Array<outType> &out, Array<inType> const &in)
-    {
-        copy<inType, outType>(out, in, scalar<outType>(0), 1.0);
-    }
+template<typename inType, typename outType>
+Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
+                        outType default_value, double factor)
+{
+    Array<outType> ret = createValueArray<outType>(dims, default_value);
+    ret.eval();
+    in.eval();
+    getQueue().enqueue(kernel::copy<outType, inType>, ret, in, outType(default_value), factor);
+    return ret;
+}
 
+template<typename inType, typename outType>
+void copyArray(Array<outType> &out, Array<inType> const &in)
+{
+    out.eval();
+    in.eval();
+    getQueue().enqueue(kernel::copy<outType, inType>, out, in, scalar<outType>(0), 1.0);
+}
 
 #define INSTANTIATE(T)                                                  \
     template void      copyData<T> (T *data, const Array<T> &from);     \
     template Array<T>  copyArray<T>(const Array<T> &A);                 \
     template void      multiply_inplace<T> (Array<T> &in, double norm); \
 
-    INSTANTIATE(float  )
-    INSTANTIATE(double )
-    INSTANTIATE(cfloat )
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int    )
-    INSTANTIATE(uint   )
-    INSTANTIATE(uchar  )
-    INSTANTIATE(char   )
-    INSTANTIATE(intl   )
-    INSTANTIATE(uintl  )
-    INSTANTIATE(short  )
-    INSTANTIATE(ushort )
+INSTANTIATE(float  )
+INSTANTIATE(double )
+INSTANTIATE(cfloat )
+INSTANTIATE(cdouble)
+INSTANTIATE(int    )
+INSTANTIATE(uint   )
+INSTANTIATE(uchar  )
+INSTANTIATE(char   )
+INSTANTIATE(intl   )
+INSTANTIATE(uintl  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 
 #define INSTANTIATE_PAD_ARRAY(SRC_T)                                    \
@@ -179,16 +119,16 @@ namespace cpu
     template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src);  \
     template void copyArray<SRC_T, char   >(Array<char   > &dst, Array<SRC_T> const &src);
 
-    INSTANTIATE_PAD_ARRAY(float )
-    INSTANTIATE_PAD_ARRAY(double)
-    INSTANTIATE_PAD_ARRAY(int   )
-    INSTANTIATE_PAD_ARRAY(uint  )
-    INSTANTIATE_PAD_ARRAY(intl  )
-    INSTANTIATE_PAD_ARRAY(uintl )
-    INSTANTIATE_PAD_ARRAY(uchar )
-    INSTANTIATE_PAD_ARRAY(char  )
-    INSTANTIATE_PAD_ARRAY(ushort)
-    INSTANTIATE_PAD_ARRAY(short )
+INSTANTIATE_PAD_ARRAY(float )
+INSTANTIATE_PAD_ARRAY(double)
+INSTANTIATE_PAD_ARRAY(int   )
+INSTANTIATE_PAD_ARRAY(uint  )
+INSTANTIATE_PAD_ARRAY(intl  )
+INSTANTIATE_PAD_ARRAY(uintl )
+INSTANTIATE_PAD_ARRAY(uchar )
+INSTANTIATE_PAD_ARRAY(char  )
+INSTANTIATE_PAD_ARRAY(ushort)
+INSTANTIATE_PAD_ARRAY(short )
 
 #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T)                            \
     template Array<cfloat > padArray<SRC_T, cfloat >(Array<SRC_T> const &src, dim4 const &dims, cfloat  default_value, double factor); \
@@ -196,8 +136,8 @@ namespace cpu
     template void copyArray<SRC_T, cfloat  >(Array<cfloat  > &dst, Array<SRC_T> const &src);    \
     template void copyArray<SRC_T, cdouble   >(Array<cdouble > &dst, Array<SRC_T> const &src);
 
-    INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
-    INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
+INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
+INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
 
 #define SPECILIAZE_UNUSED_COPYARRAY(SRC_T, DST_T) \
     template<> void copyArray<SRC_T, DST_T>(Array<DST_T> &out, Array<SRC_T> const &in) \
@@ -205,25 +145,25 @@ namespace cpu
         CPU_NOT_SUPPORTED();\
     }
 
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
 
 }
diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp
index d949a24437..c818f82795 100644
--- a/src/backend/cpu/diagonal.cpp
+++ b/src/backend/cpu/diagonal.cpp
@@ -10,83 +10,61 @@
 #include <af/array.h>
 #include <af/dim4.hpp>
 #include <af/defines.h>
+#include <handle.hpp>
 #include <Array.hpp>
 #include <diagonal.hpp>
 #include <math.hpp>
 #include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/diagonal.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> diagCreate(const Array<T> &in, const int num)
-    {
-        int size = in.dims()[0] + std::abs(num);
-        int batch = in.dims()[1];
-        Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
 
-        const T *iptr = in.get();
-        T *optr = out.get();
-
-        for (int k = 0; k < batch; k++) {
-            for (int j = 0; j < size; j++) {
-                for (int i = 0; i < size; i++) {
-                    T val = scalar<T>(0);
-                    if (i == j - num) {
-                        val = (num > 0) ? iptr[i] : iptr[j];
-                    }
-                    optr[i + j * out.strides()[1]] = val;
-                }
-            }
-            optr += out.strides()[2];
-            iptr += in.strides()[1];
-        }
-
-        return out;
-    }
+template<typename T>
+Array<T> diagCreate(const Array<T> &in, const int num)
+{
+    in.eval();
 
-    template<typename T>
-    Array<T> diagExtract(const Array<T> &in, const int num)
-    {
-        const dim_t *idims = in.dims().get();
-        dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
-        Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
+    int size = in.dims()[0] + std::abs(num);
+    int batch = in.dims()[1];
+    Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
 
-        const dim_t *odims = out.dims().get();
+    getQueue().enqueue(kernel::diagCreate<T>, out, in, num);
 
-        const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
+    return out;
+}
 
-        for (int l = 0; l < (int)odims[3]; l++) {
+template<typename T>
+Array<T> diagExtract(const Array<T> &in, const int num)
+{
+    in.eval();
 
-            for (int k = 0; k < (int)odims[2]; k++) {
-                const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
-                T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
+    const dim4 idims = in.dims();
+    dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
+    Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
 
-                for (int i = 0; i < (int)odims[0]; i++) {
-                    T val = scalar<T>(0);
-                    if (i < idims[0] && i < idims[1]) val =  iptr[i * in.strides()[1] + i];
-                    optr[i] = val;
-                }
-            }
-        }
+    getQueue().enqueue(kernel::diagExtract<T>, out, in, num);
 
-        return out;
-    }
+    return out;
+}
 
 #define INSTANTIATE_DIAGONAL(T)                                          \
     template Array<T>  diagExtract<T>    (const Array<T> &in, const int num); \
     template Array<T>  diagCreate <T>    (const Array<T> &in, const int num);
 
-    INSTANTIATE_DIAGONAL(float)
-    INSTANTIATE_DIAGONAL(double)
-    INSTANTIATE_DIAGONAL(cfloat)
-    INSTANTIATE_DIAGONAL(cdouble)
-    INSTANTIATE_DIAGONAL(int)
-    INSTANTIATE_DIAGONAL(uint)
-    INSTANTIATE_DIAGONAL(intl)
-    INSTANTIATE_DIAGONAL(uintl)
-    INSTANTIATE_DIAGONAL(char)
-    INSTANTIATE_DIAGONAL(uchar)
-    INSTANTIATE_DIAGONAL(short)
-    INSTANTIATE_DIAGONAL(ushort)
+INSTANTIATE_DIAGONAL(float)
+INSTANTIATE_DIAGONAL(double)
+INSTANTIATE_DIAGONAL(cfloat)
+INSTANTIATE_DIAGONAL(cdouble)
+INSTANTIATE_DIAGONAL(int)
+INSTANTIATE_DIAGONAL(uint)
+INSTANTIATE_DIAGONAL(intl)
+INSTANTIATE_DIAGONAL(uintl)
+INSTANTIATE_DIAGONAL(char)
+INSTANTIATE_DIAGONAL(uchar)
+INSTANTIATE_DIAGONAL(short)
+INSTANTIATE_DIAGONAL(ushort)
 
 }
diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp
index 063a761baf..1e374e95da 100644
--- a/src/backend/cpu/diff.cpp
+++ b/src/backend/cpu/diff.cpp
@@ -9,117 +9,60 @@
 
 #include <Array.hpp>
 #include <diff.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/diff.hpp>
 
 namespace cpu
 {
-    unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
-    {
-        return (l * strides[3] +
-                k * strides[2] +
-                j * strides[1] +
-                i);
-    }
 
-    template<typename T>
-    Array<T>  diff1(const Array<T> &in, const int dim)
-    {
-        // Bool for dimension
-        bool is_dim0 = dim == 0;
-        bool is_dim1 = dim == 1;
-        bool is_dim2 = dim == 2;
-        bool is_dim3 = dim == 3;
-
-        // Decrement dimension of select dimension
-        af::dim4 dims = in.dims();
-        dims[dim]--;
-
-        // Create output placeholder
-        Array<T> outArray = createValueArray(dims, (T)0);
+template<typename T>
+Array<T>  diff1(const Array<T> &in, const int dim)
+{
+    in.eval();
 
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-              T *outPtr = outArray.get();
+    // The output is one element shorter along the selected dimension
+    af::dim4 dims = in.dims();
+    dims[dim]--;
 
-        // TODO: Improve this
-        for(dim_t l = 0; l < dims[3]; l++) {
-            for(dim_t k = 0; k < dims[2]; k++) {
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                        int jdx = getIdx(in.strides(), in.offsets(),
-                                         i + is_dim0, j + is_dim1,
-                                         k + is_dim2, l + is_dim3);
-                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                        outPtr[odx] = inPtr[jdx] - inPtr[idx];
-                    }
-                }
-            }
-        }
+    Array<T> outArray = createEmptyArray<T>(dims);
 
-        return outArray;
-    }
+    getQueue().enqueue(kernel::diff1<T>, outArray, in, dim);
 
-    template<typename T>
-    Array<T>  diff2(const Array<T> &in, const int dim)
-    {
-        // Bool for dimension
-        bool is_dim0 = dim == 0;
-        bool is_dim1 = dim == 1;
-        bool is_dim2 = dim == 2;
-        bool is_dim3 = dim == 3;
+    return outArray;
+}
 
-        // Decrement dimension of select dimension
-        af::dim4 dims = in.dims();
-        dims[dim] -= 2;
+template<typename T>
+Array<T>  diff2(const Array<T> &in, const int dim)
+{
+    in.eval();
 
-        // Create output placeholder
-        Array<T> outArray = createValueArray(dims, (T)0);
+    // The output is two elements shorter along the selected dimension
+    af::dim4 dims = in.dims();
+    dims[dim] -= 2;
 
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-              T *outPtr = outArray.get();
+    Array<T> outArray = createEmptyArray<T>(dims);
 
-        // TODO: Improve this
-        for(dim_t l = 0; l < dims[3]; l++) {
-            for(dim_t k = 0; k < dims[2]; k++) {
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                        int jdx = getIdx(in.strides(), in.offsets(),
-                                         i + is_dim0, j + is_dim1,
-                                         k + is_dim2, l + is_dim3);
-                        int kdx = getIdx(in.strides(), in.offsets(),
-                                         i + 2 * is_dim0, j + 2 * is_dim1,
-                                         k + 2 * is_dim2, l + 2 * is_dim3);
-                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                        outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
-                    }
-                }
-            }
-        }
+    getQueue().enqueue(kernel::diff2<T>, outArray, in, dim);
 
-        return outArray;
-    }
+    return outArray;
+}
 
 #define INSTANTIATE(T)                                                  \
     template Array<T>  diff1<T>  (const Array<T> &in, const int dim);   \
     template Array<T>  diff2<T>  (const Array<T> &in, const int dim);   \
 
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short)
 }
diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp
index a9e7bca9eb..0eb86462e1 100644
--- a/src/backend/cpu/exampleFunction.cpp
+++ b/src/backend/cpu/exampleFunction.cpp
@@ -24,6 +24,13 @@ namespace cpu
 template<typename T>
 Array<T> exampleFunction(const Array<T> &in, const af_someenum_t method)
 {
+    in.eval();                          // Every input Array must call eval() in CPU backend
+                                        // function implementations. Since the CPU functions
+                                        // are launched asynchronously, any Array that is a
+                                        // view or a JIT node needs to be evaluated before it
+                                        // is passed on to functions that are enqueued onto
+                                        // the queue.
+
     dim4 outputDims;                    // this should be '= in.dims();' in most cases
                                         // but would definitely depend on the type of
                                         // algorithm you are implementing.
@@ -37,7 +44,7 @@ Array<T> exampleFunction(const Array<T> &in, const af_someenum_t method)
 
     //dim4 in_dims    = in.dims();        // you can retrieve dimensions
 
-    //dim4 in_offsets = in.offsets();     // you can retrieve offsets - used when given array
+    //dim_t in_offset = in.getOffset(); // you can retrieve the offset - used when given array
                                         // is an sub-array pointing to some other array and
                                         // doesn't have memory of its own
 
@@ -70,4 +77,3 @@ INSTANTIATE(cfloat)
 INSTANTIATE(cdouble)
 
 }
-
diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp
index 1c8069c24d..954f457cf4 100644
--- a/src/backend/cpu/fast.cpp
+++ b/src/backend/cpu/fast.cpp
@@ -14,240 +14,23 @@
 #include <err_cpu.hpp>
 #include <handle.hpp>
 #include <fast.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/fast.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-inline int clamp(int f, int a, int b)
-{
-    return std::max(a, std::min(f, b));
-}
-
-inline int idx_y(int i)
-{
-    if (i >= 8)
-        return clamp(-(i-8-4), -3, 3);
-
-    return clamp(i-4, -3, 3);
-}
-
-inline int idx_x(int i)
-{
-    if (i < 12)
-        return idx_y(i+4);
-
-    return idx_y(i-12);
-}
-
-inline int idx(int y, int x, unsigned idim0)
-{
-    return x * idim0 + y;
-}
-
-// test_greater()
-// Tests if a pixel x > p + thr
-inline int test_greater(float x, float p, float thr)
-{
-    return (x >= p + thr);
-}
-
-// test_smaller()
-// Tests if a pixel x < p - thr
-inline int test_smaller(float x, float p, float thr)
-{
-    return (x <= p - thr);
-}
-
-// test_pixel()
-// Returns -1 when x < p - thr
-// Returns  0 when x >= p - thr && x <= p + thr
-// Returns  1 when x > p + thr
-template<typename T>
-inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0)
-{
-    return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr);
-}
-
-// abs_diff()
-// Returns absolute difference of x and y
-inline int abs_diff(int x, int y)
-{
-    return abs(x - y);
-}
-inline unsigned abs_diff(unsigned x, unsigned y)
-{
-    return (unsigned)abs((int)x - (int)y);
-}
-inline float abs_diff(float x, float y)
-{
-    return fabs(x - y);
-}
-inline double abs_diff(double x, double y)
-{
-    return fabs(x - y);
-}
-
-template<typename T>
-void locate_features(
-    const Array<T> &in,
-    Array<float> &score,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const float thr,
-    const unsigned arc_length,
-    const unsigned nonmax,
-    const unsigned max_feat,
-    const unsigned edge)
-{
-    dim4 in_dims = in.dims();
-    const T* in_ptr = in.get();
-
-    for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
-        for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
-            float p = in_ptr[idx(y, x, in_dims[0])];
-
-            // Start by testing opposite pixels of the circle that will result in
-            // a non-kepoint
-            int d;
-            d  = test_pixel<T>(in_ptr, p, thr, y-3,   x, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3,   x, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            d &= test_pixel<T>(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+2, x-2, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y  , x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y  , x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-2, x-2, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            d &= test_pixel<T>(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3, x-1, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+1, x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-1, x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-3, x-1, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            int sum = 0;
-
-            // Sum responses [-1, 0 or 1] of first arc_length pixels
-            for (int i = 0; i < static_cast<int>(arc_length); i++)
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-
-            // Test maximum and mininmum responses of first segment of arc_length
-            // pixels
-            int max_sum = 0, min_sum = 0;
-            max_sum = std::max(max_sum, sum);
-            min_sum = std::min(min_sum, sum);
-
-            // Sum responses and test the remaining 16-arc_length pixels of the circle
-            for (int i = arc_length; i < 16; i++) {
-                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]);
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-                max_sum = std::max(max_sum, sum);
-                min_sum = std::min(min_sum, sum);
-            }
-
-            // To completely test all possible segments, it's necessary to test
-            // segments that include the top junction of the circle
-            for (int i = 0; i < static_cast<int>(arc_length-1); i++) {
-                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]);
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-                max_sum = std::max(max_sum, sum);
-                min_sum = std::min(min_sum, sum);
-            }
-
-            float s_bright = 0, s_dark = 0;
-            for (int i = 0; i < 16; i++) {
-                float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])];
-
-                s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
-                s_dark   += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr);
-            }
-
-            // If sum at some point was equal to (+-)arc_length, there is a segment
-            // that for which all pixels are much brighter or much brighter than
-            // central pixel p.
-            if (max_sum == static_cast<int>(arc_length) || min_sum == -static_cast<int>(arc_length)) {
-                unsigned j = *count;
-                ++*count;
-                if (j < max_feat) {
-                    float *x_out_ptr = x_out.get();
-                    float *y_out_ptr = y_out.get();
-                    float *score_out_ptr = score_out.get();
-                    x_out_ptr[j]     = static_cast<float>(x);
-                    y_out_ptr[j]     = static_cast<float>(y);
-                    score_out_ptr[j] = static_cast<float>(std::max(s_bright, s_dark));
-                    if (nonmax == 1) {
-                        float* score_ptr = score.get();
-                        score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark);
-                    }
-                }
-            }
-        }
-    }
-}
-
-void non_maximal(
-    const Array<float> &score,
-    const Array<float> &x_in,
-    const Array<float> &y_in,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const unsigned total_feat,
-    const unsigned edge)
-{
-    const float *score_ptr = score.get();
-    const float *x_in_ptr = x_in.get();
-    const float *y_in_ptr = y_in.get();
-
-    dim4 score_dims = score.dims();
-
-    for (unsigned k = 0; k < total_feat; k++) {
-        unsigned x = static_cast<unsigned>(round(x_in_ptr[k]));
-        unsigned y = static_cast<unsigned>(round(y_in_ptr[k]));
-
-        float v = score_ptr[y + score_dims[0] * x];
-        float max_v;
-        max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]);
-        max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]);
-        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x-1)]);
-        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x+1)]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x)  ]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]);
-
-        if (y >= score_dims[1] - edge - 1 || y <= edge + 1 ||
-            x >= score_dims[0] - edge - 1 || x <= edge + 1)
-            continue;
-
-        // Stores keypoint to feat_out if it's response is maximum compared to
-        // its 8-neighborhood
-        if (v > max_v) {
-            unsigned j = *count;
-            ++*count;
-
-            float *x_out_ptr = x_out.get();
-            float *y_out_ptr = y_out.get();
-            float *score_out_ptr = score_out.get();
-
-            x_out_ptr[j]     = static_cast<float>(x);
-            y_out_ptr[j]     = static_cast<float>(y);
-            score_out_ptr[j] = static_cast<float>(v);
-        }
-    }
-}
-
 template<typename T>
 unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
               const Array<T> &in, const float thr, const unsigned arc_length,
               const bool nonmax, const float feature_ratio,
               const unsigned edge)
 {
+    in.eval();
+
     dim4 in_dims = in.dims();
     const unsigned max_feat = ceil(in.elements() * feature_ratio);
 
@@ -257,7 +40,9 @@ unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
     if (nonmax == 1) {
         dim4 V_dims(in_dims[0], in_dims[1]);
         V = createValueArray<float>(V_dims, (float)0);
+        V.eval();
     }
+    getQueue().sync();
 
     // Arrays containing all features detected before non-maximal suppression.
     dim4 max_feat_dims(max_feat);
@@ -268,7 +53,7 @@ unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
     // Feature counter
     unsigned count = 0;
 
-    locate_features<T>(in, V, x, y, score, &count, thr, arc_length,
+    kernel::locate_features<T>(in, V, x, y, score, &count, thr, arc_length,
                        nonmax, max_feat, edge);
 
     // If more features than max_feat were detected, feat wasn't populated
@@ -282,13 +67,12 @@ unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
     Array<float> score_total = createEmptyArray<float>(af::dim4());
 
     if (nonmax == 1) {
-
         x_total     = createEmptyArray<float>(feat_found_dims);
         y_total     = createEmptyArray<float>(feat_found_dims);
         score_total = createEmptyArray<float>(feat_found_dims);
 
         count = 0;
-        non_maximal(V, x, y,
+        kernel::non_maximal(V, x, y,
                     x_total, y_total, score_total,
                     &count, feat_found, edge);
 
diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp
index e41c8a1658..3c1d10a4f3 100644
--- a/src/backend/cpu/fft.cpp
+++ b/src/backend/cpu/fft.cpp
@@ -12,151 +12,34 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <fft.hpp>
-#include <err_cpu.hpp>
-#include <fftw3.h>
+#include <kernel/fft.hpp>
 #include <copy.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<int rank>
-void computeDims(int rdims[rank], const dim4 &idims)
-{
-    for (int i = 0; i < rank; i++) {
-        rdims[i] = idims[(rank -1) - i];
-    }
-}
-
-template<typename T>
-struct fftw_transform;
-
-#define TRANSFORM(PRE, TY)                                              \
-    template<>                                                          \
-    struct fftw_transform<TY>                                           \
-    {                                                                   \
-        typedef PRE##_plan plan_t;                                      \
-        typedef PRE##_complex ctype_t;                                  \
-                                                                        \
-        template<typename... Args>                                      \
-            plan_t create(Args... args)                                 \
-        { return PRE##_plan_many_dft(args...); }                        \
-        void execute(plan_t plan) { return PRE##_execute(plan); }       \
-        void destroy(plan_t plan) { return PRE##_destroy_plan(plan); }  \
-    };                                                                  \
-
-
-TRANSFORM(fftwf, cfloat)
-TRANSFORM(fftw, cdouble)
-
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in)
 {
-    int t_dims[rank];
-    int in_embed[rank];
-
-    const dim4 idims = in.dims();
-
-    computeDims<rank>(t_dims  , idims);
-    computeDims<rank>(in_embed , in.getDataDims());
-
-    const dim4 istrides = in.strides();
-
-    typedef typename fftw_transform<T>::ctype_t ctype_t;
-    typename fftw_transform<T>::plan_t plan;
-
-    fftw_transform<T> transform;
-
-    int batch = 1;
-    for (int i = rank; i < 4; i++) {
-        batch *= idims[i];
-    }
-
-    plan = transform.create(rank,
-                            t_dims,
-                            (int)batch,
-                            (ctype_t *)in.get(),
-                            in_embed, (int)istrides[0],
-                            (int)istrides[rank],
-                            (ctype_t *)in.get(),
-                            in_embed, (int)istrides[0],
-                            (int)istrides[rank],
-                            direction ? FFTW_FORWARD : FFTW_BACKWARD,
-                            FFTW_ESTIMATE);
-
-    transform.execute(plan);
-    transform.destroy(plan);
+    in.eval();
+    getQueue().enqueue(kernel::fft_inplace<T, rank, direction>, in);
 }
 
-template<typename To, typename Ti>
-struct fftw_real_transform;
-
-#define TRANSFORM_REAL(PRE, To, Ti, POST)                               \
-    template<>                                                          \
-    struct fftw_real_transform<To, Ti>                                  \
-    {                                                                   \
-        typedef PRE##_plan plan_t;                                      \
-        typedef PRE##_complex ctype_t;                                  \
-                                                                        \
-        template<typename... Args>                                      \
-            plan_t create(Args... args)                                 \
-        { return PRE##_plan_many_dft_##POST(args...); }                 \
-        void execute(plan_t plan) { return PRE##_execute(plan); }       \
-        void destroy(plan_t plan) { return PRE##_destroy_plan(plan); }  \
-    };                                                                  \
-
-
-TRANSFORM_REAL(fftwf, cfloat , float , r2c)
-TRANSFORM_REAL(fftw , cdouble, double, r2c)
-TRANSFORM_REAL(fftwf, float , cfloat , c2r)
-TRANSFORM_REAL(fftw , double, cdouble, c2r)
-
 template<typename Tc, typename Tr, int rank>
 Array<Tc> fft_r2c(const Array<Tr> &in)
 {
-    dim4 idims = in.dims();
-    dim4 odims = in.dims();
+    in.eval();
 
+    dim4 odims = in.dims();
     odims[0] = odims[0] / 2 + 1;
-
     Array<Tc> out = createEmptyArray<Tc>(odims);
 
-    int t_dims[rank];
-    int in_embed[rank];
-    int out_embed[rank];
-
-    computeDims<rank>(t_dims  , idims);
-    computeDims<rank>(in_embed , in.getDataDims());
-    computeDims<rank>(out_embed , out.getDataDims());
-
-    const dim4 istrides = in.strides();
-    const dim4 ostrides = out.strides();
-
-    typedef typename fftw_real_transform<Tc, Tr>::ctype_t ctype_t;
-    typename fftw_real_transform<Tc, Tr>::plan_t plan;
-
-    fftw_real_transform<Tc, Tr> transform;
-
-    int batch = 1;
-    for (int i = rank; i < 4; i++) {
-        batch *= idims[i];
-    }
-
-    plan = transform.create(rank,
-                            t_dims,
-                            (int)batch,
-                            (Tr *)in.get(),
-                            in_embed, (int)istrides[0],
-                            (int)istrides[rank],
-                            (ctype_t *)out.get(),
-                            out_embed, (int)ostrides[0],
-                            (int)ostrides[rank],
-                            FFTW_ESTIMATE);
-
-    transform.execute(plan);
-    transform.destroy(plan);
+    getQueue().enqueue(kernel::fft_r2c<Tc, Tr, rank>, out, in);
 
     return out;
 }
@@ -164,42 +47,11 @@ Array<Tc> fft_r2c(const Array<Tr> &in)
 template<typename Tr, typename Tc, int rank>
 Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
 {
-    Array<Tr> out = createEmptyArray<Tr>(odims);
+    in.eval();
 
-    int t_dims[rank];
-    int in_embed[rank];
-    int out_embed[rank];
-
-    computeDims<rank>(t_dims  , odims);
-    computeDims<rank>(in_embed , in.getDataDims());
-    computeDims<rank>(out_embed , out.getDataDims());
-
-    const dim4 istrides = in.strides();
-    const dim4 ostrides = out.strides();
-
-    typedef typename fftw_real_transform<Tr, Tc>::ctype_t ctype_t;
-    typename fftw_real_transform<Tr, Tc>::plan_t plan;
-
-    fftw_real_transform<Tr, Tc> transform;
-
-    int batch = 1;
-    for (int i = rank; i < 4; i++) {
-        batch *= odims[i];
-    }
-
-    plan = transform.create(rank,
-                            t_dims,
-                            (int)batch,
-                            (ctype_t *)in.get(),
-                            in_embed, (int)istrides[0],
-                            (int)istrides[rank],
-                            (Tr *)out.get(),
-                            out_embed, (int)ostrides[0],
-                            (int)ostrides[rank],
-                            FFTW_ESTIMATE);
+    Array<Tr> out = createEmptyArray<Tr>(odims);
+    getQueue().enqueue(kernel::fft_c2r<Tr, Tc, rank>, out, in, odims);
 
-    transform.execute(plan);
-    transform.destroy(plan);
     return out;
 }
 
@@ -211,8 +63,8 @@ Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
     template void fft_inplace<T, 2, false>(Array<T> &in);   \
     template void fft_inplace<T, 3, false>(Array<T> &in);
 
-    INSTANTIATE(cfloat )
-    INSTANTIATE(cdouble)
+INSTANTIATE(cfloat )
+INSTANTIATE(cdouble)
 
 #define INSTANTIATE_REAL(Tr, Tc)                                        \
     template Array<Tc> fft_r2c<Tc, Tr, 1>(const Array<Tr> &in);         \
@@ -222,6 +74,7 @@ Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
     template Array<Tr> fft_c2r<Tr, Tc, 2>(const Array<Tc> &in, const dim4 &odims); \
     template Array<Tr> fft_c2r<Tr, Tc, 3>(const Array<Tc> &in, const dim4 &odims); \
 
-    INSTANTIATE_REAL(float , cfloat )
-    INSTANTIATE_REAL(double, cdouble)
+INSTANTIATE_REAL(float , cfloat )
+INSTANTIATE_REAL(double, cdouble)
+
 }
diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp
index f76f3a0d3f..3b4b864452 100644
--- a/src/backend/cpu/fftconvolve.cpp
+++ b/src/backend/cpu/fftconvolve.cpp
@@ -17,208 +17,20 @@
 #include <fftw3.h>
 #include <copy.hpp>
 #include <convolve_common.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/fftconvolve.hpp>
 
 namespace cpu
 {
 
-template<typename To, typename Ti>
-void packData(To* out_ptr, const af::dim4& od, const af::dim4& os,
-              Array<Ti> const& in)
-{
-    const af::dim4 id = in.dims();
-    const af::dim4 is = in.strides();
-    const Ti* in_ptr = in.get();
-
-    int id0_half = divup(id[0], 2);
-    bool odd_id0 = (id[0] % 2 == 1);
-
-    for (int d3 = 0; d3 < (int)od[3]; d3++) {
-        for (int d2 = 0; d2 < (int)od[2]; d2++) {
-            for (int d1 = 0; d1 < (int)od[1]; d1++) {
-                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
-                    const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-
-                    if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
-                        const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
-                        out_ptr[oidx]   = (To)in_ptr[iidx];
-                        if (d0 == id0_half-1 && odd_id0)
-                            out_ptr[oidx+1] = (To)0;
-                        else
-                            out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half];
-                    }
-                    else {
-                        // Pad remaining elements with 0s
-                        out_ptr[oidx]   = (To)0;
-                        out_ptr[oidx+1] = (To)0;
-                    }
-                }
-            }
-        }
-    }
-}
-
-template<typename To, typename Ti>
-void padArray(To* out_ptr, const af::dim4& od, const af::dim4& os,
-              Array<Ti> const& in)
-{
-    const af::dim4 id = in.dims();
-    const af::dim4 is = in.strides();
-    const Ti* in_ptr = in.get();
-
-    for (int d3 = 0; d3 < (int)od[3]; d3++) {
-        for (int d2 = 0; d2 < (int)od[2]; d2++) {
-            for (int d1 = 0; d1 < (int)od[1]; d1++) {
-                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
-                    const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-
-                    if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
-                        // Copy input elements to real elements, set imaginary elements to 0
-                        const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
-                        out_ptr[oidx]   = (To)in_ptr[iidx];
-                        out_ptr[oidx+1] = (To)0;
-                    }
-                    else {
-                        // Pad remaining of the matrix to 0s
-                        out_ptr[oidx]   = (To)0;
-                        out_ptr[oidx+1] = (To)0;
-                    }
-                }
-            }
-        }
-    }
-}
-
-template<typename T>
-void complexMultiply(T* out_ptr, const af::dim4& od, const af::dim4& os,
-                     T* in1_ptr, const af::dim4& i1d, const af::dim4& i1s,
-                     T* in2_ptr, const af::dim4& i2d, const af::dim4& i2s,
-                     ConvolveBatchKind kind)
-{
-    for (int d3 = 0; d3 < (int)od[3]; d3++) {
-        for (int d2 = 0; d2 < (int)od[2]; d2++) {
-            for (int d1 = 0; d1 < (int)od[1]; d1++) {
-                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
-                    if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) {
-                        // Complex multiply each signal to equivalent filter
-                        const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-                        const int iidx = ridx + 1;
-
-                        T a = in1_ptr[ridx];
-                        T b = in1_ptr[iidx];
-                        T c = in2_ptr[ridx];
-                        T d = in2_ptr[iidx];
-
-                        T ac = a*c;
-                        T bd = b*d;
-
-                        out_ptr[ridx] = ac - bd;
-                        out_ptr[iidx] = (a+b) * (c+d) - ac - bd;
-                    }
-                    else if (kind == CONVOLVE_BATCH_SIGNAL) {
-                        // Complex multiply all signals to filter
-                        const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-                        const int iidx1 = ridx1 + 1;
-                        const int ridx2 = ridx1 % (i2s[3] * i2d[3]);
-                        const int iidx2 = iidx1 % (i2s[3] * i2d[3]);
-
-                        T a = in1_ptr[ridx1];
-                        T b = in1_ptr[iidx1];
-                        T c = in2_ptr[ridx2];
-                        T d = in2_ptr[iidx2];
-
-                        T ac = a*c;
-                        T bd = b*d;
-
-                        out_ptr[ridx1] = ac - bd;
-                        out_ptr[iidx1] = (a+b) * (c+d) - ac - bd;
-                    }
-                    else if (kind == CONVOLVE_BATCH_KERNEL) {
-                        // Complex multiply signal to all filters
-                        const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-                        const int iidx2 = ridx2 + 1;
-                        const int ridx1 = ridx2 % (i1s[3] * i1d[3]);
-                        const int iidx1 = iidx2 % (i1s[3] * i1d[3]);
-
-                        T a = in1_ptr[ridx1];
-                        T b = in1_ptr[iidx1];
-                        T c = in2_ptr[ridx2];
-                        T d = in2_ptr[iidx2];
-
-                        T ac = a*c;
-                        T bd = b*d;
-
-                        out_ptr[ridx2] = ac - bd;
-                        out_ptr[iidx2] = (a+b) * (c+d) - ac - bd;
-                    }
-                }
-            }
-        }
-    }
-}
-
-template<typename To, typename Ti, bool roundOut>
-void reorderOutput(To* out_ptr, const af::dim4& od, const af::dim4& os,
-                   const Ti* in_ptr, const af::dim4& id, const af::dim4& is,
-                   const af::dim4& fd, const int half_di0, const int baseDim,
-                   const int fftScale, const bool expand)
-{
-    for (int d3 = 0; d3 < (int)od[3]; d3++) {
-        for (int d2 = 0; d2 < (int)od[2]; d2++) {
-            for (int d1 = 0; d1 < (int)od[1]; d1++) {
-                for (int d0 = 0; d0 < (int)od[0]; d0++) {
-                    int id0, id1, id2, id3;
-                    if (expand) {
-                        id0 = d0;
-                        id1 = d1 * is[1];
-                        id2 = d2 * is[2];
-                        id3 = d3 * is[3];
-                    }
-                    else {
-                        id0 = d0 + fd[0]/2;
-                        id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1];
-                        id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2];
-                        id3 = d3 * is[3];
-                    }
-
-                    int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0;
-
-                    // Divide output elements to cuFFT resulting scale, round result if output
-                    // type is single or double precision floating-point
-                    if (id0 < half_di0) {
-                        // Copy top elements
-                        int iidx = id3 + id2 + id1 + id0 * 2;
-                        if (roundOut)
-                            out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
-                        else
-                            out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
-                    }
-                    else if (id0 < half_di0 + (int)fd[0] - 1) {
-                        // Add signal and filter elements to central part
-                        int iidx1 = id3 + id2 + id1 + id0 * 2;
-                        int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
-                        if (roundOut)
-                            out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale));
-                        else
-                            out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale);
-                    }
-                    else {
-                        // Copy bottom elements
-                        const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
-                        if (roundOut)
-                            out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
-                        else
-                            out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
-                    }
-                }
-            }
-        }
-    }
-}
-
 template<typename T, typename convT, typename cT, bool isDouble, bool roundOut, dim_t baseDim>
 Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
                      const bool expand, ConvolveBatchKind kind)
 {
+    signal.eval();
+    filter.eval();
+
     const af::dim4 sd = signal.dims();
     const af::dim4 fd = filter.dims();
 
@@ -249,9 +61,6 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
     packed_dims[baseDim] = (sbatch + fbatch);
 
     Array<convT> packed = createEmptyArray<convT>(packed_dims);
-    convT *packed_ptr = packed.get();
-
-    const af::dim4 packed_strides = packed.strides();
 
     sig_tmp_dims[0]    = filter_tmp_dims[0] = packed_dims[0];
     sig_tmp_strides[0] = filter_tmp_strides[0] = 1;
@@ -270,107 +79,117 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
         filter_tmp_strides[k] = filter_tmp_strides[k - 1] * filter_tmp_dims[k - 1];
     }
 
-    // Calculate memory offsets for packed signal and filter
-    convT *sig_tmp_ptr    = packed_ptr;
-    convT *filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3];
-
     // Number of packed complex elements in dimension 0
     dim_t sig_half_d0 = divup(sd[0], 2);
 
     // Pack signal in a complex matrix where first dimension is half the input
     // (allows faster FFT computation) and pad array to a power of 2 with 0s
-    packData<convT, T>(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, signal);
+    getQueue().enqueue(kernel::packData<convT, T>, packed, sig_tmp_dims, sig_tmp_strides, signal);
 
     // Pad filter array with 0s
-    padArray<convT, T>(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, filter);
-
-    // Compute forward FFT
-    if (isDouble) {
-        fftw_plan plan = fftw_plan_many_dft(baseDim,
-                                            fft_dims,
-                                            packed_dims[baseDim],
-                                            (fftw_complex*)packed.get(),
-                                            NULL,
-                                            packed_strides[0],
-                                            packed_strides[baseDim] / 2,
-                                            (fftw_complex*)packed.get(),
-                                            NULL,
-                                            packed_strides[0],
-                                            packed_strides[baseDim] / 2,
-                                            FFTW_FORWARD,
-                                            FFTW_ESTIMATE);
-
-        fftw_execute(plan);
-        fftw_destroy_plan(plan);
-    }
-    else {
-        fftwf_plan plan = fftwf_plan_many_dft(baseDim,
-                                              fft_dims,
-                                              packed_dims[baseDim],
-                                              (fftwf_complex*)packed.get(),
-                                              NULL,
-                                              packed_strides[0],
-                                              packed_strides[baseDim] / 2,
-                                              (fftwf_complex*)packed.get(),
-                                              NULL,
-                                              packed_strides[0],
-                                              packed_strides[baseDim] / 2,
-                                              FFTW_FORWARD,
-                                              FFTW_ESTIMATE);
-
-        fftwf_execute(plan);
-        fftwf_destroy_plan(plan);
-    }
+    const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; // size of the signal region; the filter region of `packed` begins at this element offset
+    getQueue().enqueue(kernel::padArray<convT, T>, packed, filter_tmp_dims, filter_tmp_strides,
+                       filter, offset);
+
+    dim4 fftDims(1, 1, 1, 1); // dim4 copy of the first baseDim FFT sizes, passed by value to the queued FFT lambdas
+    for (int i=0; i<baseDim; ++i)
+        fftDims[i] = fft_dims[i];
+
+    auto upstream_dft = [=] (Array<convT> packed, const dim4 fftDims) { // task for the queue: forward FFT of `packed`
+        int fft_dims[baseDim]; // FFTW takes the transform sizes as an int array, so rebuild one from fftDims
+        for (int i=0; i<baseDim; ++i)
+            fft_dims[i] = fftDims[i];
+        const dim4 packed_dims = packed.dims();
+        const af::dim4 packed_strides = packed.strides();
+        // Compute forward FFT in place: packed.get() is passed as both the input and the output buffer
+        if (isDouble) { // double precision uses the fftw_* API, single precision the fftwf_* API
+            fftw_plan plan = fftw_plan_many_dft(baseDim,
+                                                fft_dims,
+                                                packed_dims[baseDim], // number of transforms in the batch
+                                                (fftw_complex*)packed.get(),
+                                                NULL,
+                                                packed_strides[0],
+                                                packed_strides[baseDim] / 2,
+                                                (fftw_complex*)packed.get(),
+                                                NULL,
+                                                packed_strides[0],
+                                                packed_strides[baseDim] / 2,
+                                                FFTW_FORWARD,
+                                                FFTW_ESTIMATE);
+
+            fftw_execute(plan);
+            fftw_destroy_plan(plan); // the plan is single-use: built, executed once and released here
+        } else {
+            fftwf_plan plan = fftwf_plan_many_dft(baseDim,
+                                                  fft_dims,
+                                                  packed_dims[baseDim],
+                                                  (fftwf_complex*)packed.get(),
+                                                  NULL,
+                                                  packed_strides[0],
+                                                  packed_strides[baseDim] / 2,
+                                                  (fftwf_complex*)packed.get(),
+                                                  NULL,
+                                                  packed_strides[0],
+                                                  packed_strides[baseDim] / 2,
+                                                  FFTW_FORWARD,
+                                                  FFTW_ESTIMATE);
+
+            fftwf_execute(plan);
+            fftwf_destroy_plan(plan);
+        }
+    };
+    getQueue().enqueue(upstream_dft, packed, fftDims); // submitted after the packData/padArray tasks enqueued above
 
     // Multiply filter and signal FFT arrays
-    if (kind == CONVOLVE_BATCH_KERNEL)
-        complexMultiply<convT>(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
-                               sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
-                               filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
-                               kind);
-    else
-        complexMultiply<convT>(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
-                               sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
-                               filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
-                               kind);
-
-    // Compute inverse FFT
-    if (isDouble) {
-        fftw_plan plan = fftw_plan_many_dft(baseDim,
-                                            fft_dims,
-                                            packed_dims[baseDim],
-                                            (fftw_complex*)packed.get(),
-                                            NULL,
-                                            packed_strides[0],
-                                            packed_strides[baseDim] / 2,
-                                            (fftw_complex*)packed.get(),
-                                            NULL,
-                                            packed_strides[0],
-                                            packed_strides[baseDim] / 2,
-                                            FFTW_BACKWARD,
-                                            FFTW_ESTIMATE);
-
-        fftw_execute(plan);
-        fftw_destroy_plan(plan);
-    }
-    else {
-        fftwf_plan plan = fftwf_plan_many_dft(baseDim,
-                                              fft_dims,
-                                              packed_dims[baseDim],
-                                              (fftwf_complex*)packed.get(),
-                                              NULL,
-                                              packed_strides[0],
-                                              packed_strides[baseDim] / 2,
-                                              (fftwf_complex*)packed.get(),
-                                              NULL,
-                                              packed_strides[0],
-                                              packed_strides[baseDim] / 2,
-                                              FFTW_BACKWARD,
-                                              FFTW_ESTIMATE);
-
-        fftwf_execute(plan);
-        fftwf_destroy_plan(plan);
-    }
+    getQueue().enqueue(kernel::complexMultiply<convT>, packed, // `kind` and `offset` are forwarded; the removed code chose the destination region (signal vs. filter) here
+                       sig_tmp_dims, sig_tmp_strides,
+                       filter_tmp_dims, filter_tmp_strides,
+                       kind, offset);
+
+    auto upstream_idft = [=] (Array<convT> packed, const dim4 fftDims) { // task for the queue: inverse FFT of `packed`
+        int fft_dims[baseDim]; // FFTW takes the transform sizes as an int array, so rebuild one from fftDims
+        for (int i=0; i<baseDim; ++i)
+            fft_dims[i] = fftDims[i];
+        const dim4 packed_dims = packed.dims();
+        const af::dim4 packed_strides = packed.strides();
+        // Compute inverse FFT in place: packed.get() is passed as both the input and the output buffer
+        if (isDouble) { // double precision uses the fftw_* API, single precision the fftwf_* API
+            fftw_plan plan = fftw_plan_many_dft(baseDim,
+                                                fft_dims,
+                                                packed_dims[baseDim], // number of transforms in the batch
+                                                (fftw_complex*)packed.get(),
+                                                NULL,
+                                                packed_strides[0],
+                                                packed_strides[baseDim] / 2,
+                                                (fftw_complex*)packed.get(),
+                                                NULL,
+                                                packed_strides[0],
+                                                packed_strides[baseDim] / 2,
+                                                FFTW_BACKWARD,
+                                                FFTW_ESTIMATE);
+
+            fftw_execute(plan);
+            fftw_destroy_plan(plan); // the plan is single-use: built, executed once and released here
+        } else {
+            fftwf_plan plan = fftwf_plan_many_dft(baseDim,
+                                                  fft_dims,
+                                                  packed_dims[baseDim],
+                                                  (fftwf_complex*)packed.get(),
+                                                  NULL,
+                                                  packed_strides[0],
+                                                  packed_strides[baseDim] / 2,
+                                                  (fftwf_complex*)packed.get(),
+                                                  NULL,
+                                                  packed_strides[0],
+                                                  packed_strides[baseDim] / 2,
+                                                  FFTW_BACKWARD,
+                                                  FFTW_ESTIMATE);
+
+            fftwf_execute(plan);
+            fftwf_destroy_plan(plan);
+        }
+    };
+    getQueue().enqueue(upstream_idft, packed, fftDims); // submitted after the complexMultiply task enqueued above
 
     // Compute output dimensions
     dim4 oDims(1);
@@ -391,25 +210,10 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
     }
 
     Array<T> out = createEmptyArray<T>(oDims);
-    T* out_ptr = out.get();
-    const af::dim4 out_dims = out.dims();
-    const af::dim4 out_strides = out.strides();
 
-    const af::dim4 filter_dims = filter.dims();
-
-    // Reorder the output
-    if (kind == CONVOLVE_BATCH_KERNEL) {
-        reorderOutput<T, convT, roundOut>
-            (out_ptr, out_dims, out_strides,
-             filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
-             filter_dims, sig_half_d0, baseDim, fftScale, expand);
-    }
-    else {
-        reorderOutput<T, convT, roundOut>
-            (out_ptr, out_dims, out_strides,
-             sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
-             filter_dims, sig_half_d0, baseDim, fftScale, expand);
-    }
+    getQueue().enqueue(kernel::reorder<T, convT, roundOut, baseDim>, out, packed, filter,
+                       sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims,
+                       filter_tmp_strides, expand, kind);
 
     return out;
 }
diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp
index 8ab2fe46fc..aa417f49e1 100644
--- a/src/backend/cpu/gradient.cpp
+++ b/src/backend/cpu/gradient.cpp
@@ -12,83 +12,29 @@
 #include <math.hpp>
 #include <stdexcept>
 #include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/gradient.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    void gradient(Array<T> &grad0, Array<T> &grad1, const Array<T> &in)
-    {
-        const af::dim4 dims = in.dims();
 
-        T *d_grad0    = grad0.get();
-        T *d_grad1    = grad1.get();
-        const T *d_in = in.get();
-
-        const af::dim4 inst = in.strides();
-        const af::dim4 g0st = grad0.strides();
-        const af::dim4 g1st = grad1.strides();
-
-        T v5 = scalar<T>(0.5);
-        T v1 = scalar<T>(1.0);
-
-        for(dim_t idw = 0; idw < dims[3]; idw++) {
-            const dim_t inW = idw * inst[3];
-            const dim_t g0W = idw * g0st[3];
-            const dim_t g1W = idw * g1st[3];
-            for(dim_t idz = 0; idz < dims[2]; idz++) {
-                const dim_t inZW = inW + idz * inst[2];
-                const dim_t g0ZW = g0W + idz * g0st[2];
-                const dim_t g1ZW = g1W + idz * g1st[2];
-                dim_t xl, xr, yl,yr;
-                T f0, f1;
-                for(dim_t idy = 0; idy < dims[1]; idy++) {
-                    const dim_t inYZW = inZW + idy * inst[1];
-                    const dim_t g0YZW = g0ZW + idy * g0st[1];
-                    const dim_t g1YZW = g1ZW + idy * g1st[1];
-                    if(idy == 0) {
-                        yl = inYZW + inst[1];
-                        yr = inYZW;
-                        f1 = v1;
-                    } else if(idy == dims[1] - 1) {
-                        yl = inYZW;
-                        yr = inYZW - inst[1];
-                        f1 = v1;
-                    } else {
-                        yl = inYZW + inst[1];
-                        yr = inYZW - inst[1];
-                        f1 = v5;
-                    }
-                    for(dim_t idx = 0; idx < dims[0]; idx++) {
-                        const dim_t inMem = inYZW + idx;
-                        const dim_t g0Mem = g0YZW + idx;
-                        const dim_t g1Mem = g1YZW + idx;
-                        if(idx == 0) {
-                            xl = inMem + 1;
-                            xr = inMem;
-                            f0 = v1;
-                        } else if(idx == dims[0] - 1) {
-                            xl = inMem;
-                            xr = inMem - 1;
-                            f0 = v1;
-                        } else {
-                            xl = inMem + 1;
-                            xr = inMem - 1;
-                            f0 = v5;
-                        }
+template<typename T>
+void gradient(Array<T> &grad0, Array<T> &grad1, const Array<T> &in)
+{   // Thin wrapper: the inline loops removed by this patch are replaced by one queued kernel::gradient<T> call
+    grad0.eval();   // NOTE(review): presumably materializes each array before the kernel is queued -- confirm against Array::eval
+    grad1.eval();
+    in.eval();
 
-                        d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]);
-                        d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]);
-                    }
-                }
-            }
-        }
-    }
+    getQueue().enqueue(kernel::gradient<T>, grad0, grad1, in);   // kernel/gradient.hpp is expected to fill grad0/grad1 like the removed loops did
+}
 
 #define INSTANTIATE(T)                                                                  \
     template void gradient<T>(Array<T> &grad0, Array<T> &grad1, const Array<T> &in);    \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+
 }
diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp
index d16c56a8b2..b5ea0ca20e 100644
--- a/src/backend/cpu/harris.cpp
+++ b/src/backend/cpu/harris.cpp
@@ -12,142 +12,28 @@
 #include <af/constants.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
 #include <harris.hpp>
 #include <convolve.hpp>
 #include <gradient.hpp>
 #include <sort_index.hpp>
 #include <cstring>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/harris.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * af::Pi * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-void second_order_deriv(
-    T* ixx_out,
-    T* ixy_out,
-    T* iyy_out,
-    const unsigned in_len,
-    const T* ix_in,
-    const T* iy_in)
-{
-    for (unsigned x = 0; x < in_len; x++) {
-        ixx_out[x] = ix_in[x] * ix_in[x];
-        ixy_out[x] = ix_in[x] * iy_in[x];
-        iyy_out[x] = iy_in[x] * iy_in[x];
-    }
-}
-
-template<typename T>
-void harris_responses(
-    T* resp_out,
-    const unsigned idim0,
-    const unsigned idim1,
-    const T* ixx_in,
-    const T* ixy_in,
-    const T* iyy_in,
-    const float k_thr,
-    const unsigned border_len)
-{
-    const unsigned r = border_len;
-
-    for (unsigned x = r; x < idim1 - r; x++) {
-        for (unsigned y = r; y < idim0 - r; y++) {
-            const unsigned idx = x * idim0 + y;
-
-            // Calculates matrix trace and determinant
-            T tr = ixx_in[idx] + iyy_in[idx];
-            T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx];
-
-            // Calculates local Harris response
-            resp_out[idx] = det - k_thr * (tr*tr);
-        }
-    }
-}
-
-template<typename T>
-void non_maximal(
-    float* x_out,
-    float* y_out,
-    float* resp_out,
-    unsigned* count,
-    const unsigned idim0,
-    const unsigned idim1,
-    const T* resp_in,
-    const float min_resp,
-    const unsigned border_len,
-    const unsigned max_corners)
-{
-    // Responses on the border don't have 8-neighbors to compare, discard them
-    const unsigned r = border_len + 1;
-
-    for (unsigned x = r; x < idim1 - r; x++) {
-        for (unsigned y = r; y < idim0 - r; y++) {
-            const T v = resp_in[x * idim0 + y];
-
-            // Find maximum neighborhood response
-            T max_v;
-            max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x)   * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
-
-            // Stores corner to {x,y,resp}_out if it's response is maximum compared
-            // to its 8-neighborhood and greater or equal minimum response
-            if (v > max_v && v >= (T)min_resp) {
-                const unsigned idx = *count;
-                *count += 1;
-                if (idx < max_corners) {
-                    x_out[idx]    = (float)x;
-                    y_out[idx]    = (float)y;
-                    resp_out[idx] = (float)v;
-                }
-            }
-        }
-    }
-}
-
-static void keep_corners(float* x_out, float* y_out, float* resp_out,
-                         const float* x_in, const float* y_in, const float* resp_in,
-                         const unsigned* resp_idx, const unsigned n_corners)
-{
-    // Keep only the first n_feat features
-    for (unsigned f = 0; f < n_corners; f++) {
-        x_out[f] = x_in[resp_idx[f]];
-        y_out[f] = y_in[resp_idx[f]];
-        resp_out[f] = resp_in[f];
-    }
-}
-
 template<typename T, typename convAccT>
 unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                 const Array<T> &in, const unsigned max_corners, const float min_response,
                 const float sigma, const unsigned filter_len, const float k_thr)
 {
+    in.eval();
+
     dim4 idims = in.dims();
 
     // Window filter
@@ -156,8 +42,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
     if (sigma < 0.5f) {
         for (unsigned i = 0; i < filter_len; i++)
             h_filter[i] = (T)1.f / (filter_len);
-    }
-    else {
+    } else {
         gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
     }
     Array<convAccT> filter = createDeviceDataArray<convAccT>(dim4(filter_len), (const void*)h_filter);
@@ -168,15 +53,14 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
     Array<T> iy = createEmptyArray<T>(idims);
 
     // Compute first order derivatives
-    gradient<T>(iy, ix, in);
+    gradient<T>(iy, ix, in); // call directly: cpu::gradient() enqueues kernel::gradient itself, keeping it ahead of second_order_deriv
 
     Array<T> ixx = createEmptyArray<T>(idims);
     Array<T> ixy = createEmptyArray<T>(idims);
     Array<T> iyy = createEmptyArray<T>(idims);
 
     // Compute second-order derivatives
-    second_order_deriv<T>(ixx.get(), ixy.get(), iyy.get(),
-                          in.elements(), ix.get(), iy.get());
+    getQueue().enqueue(kernel::second_order_deriv<T>, ixx, ixy, iyy, in.elements(), ix, iy);
 
     // Convolve second-order derivatives with proper window filter
     ixx = convolve2<T, convAccT, false>(ixx, filter, filter);
@@ -185,26 +69,22 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
 
     const unsigned corner_lim = in.elements() * 0.2f;
 
-    float* x_corners = memAlloc<float>(corner_lim);
-    float* y_corners = memAlloc<float>(corner_lim);
-    float* resp_corners = memAlloc<float>(corner_lim);
+    Array<T> responses = createEmptyArray<T>(dim4(in.elements()));
 
-    T* resp = memAlloc<T>(in.elements());
+    getQueue().enqueue(kernel::harris_responses<T>, responses, idims[0], idims[1],
+                       ixx, ixy, iyy, k_thr, border_len);
 
-    // Calculate Harris responses for all pixels
-    harris_responses<T>(resp,
-                        idims[0], idims[1],
-                        ixx.get(), ixy.get(), iyy.get(),
-                        k_thr, border_len);
+    Array<float> xCorners    = createEmptyArray<float>(dim4(corner_lim));
+    Array<float> yCorners    = createEmptyArray<float>(dim4(corner_lim));
+    Array<float> respCorners = createEmptyArray<float>(dim4(corner_lim));
 
     const unsigned min_r = (max_corners > 0) ? 0.f : min_response;
-    unsigned corners_found = 0;
 
     // Performs non-maximal suppression
-    non_maximal<T>(x_corners, y_corners, resp_corners, &corners_found,
-                   idims[0], idims[1], resp, min_r, border_len, corner_lim);
-
-    memFree(resp);
+    getQueue().sync(); // non_maximal is called directly (not enqueued), so queued work feeding `responses` must be done first -- assumes sync() waits for the queue
+    unsigned corners_found = 0; // written through the pointer below and read immediately afterwards to size the outputs
+    kernel::non_maximal<T>(xCorners, yCorners, respCorners, &corners_found,
+                   idims[0], idims[1], responses, min_r, border_len, corner_lim);
 
     const unsigned corners_out = (max_corners > 0) ?
                                  min(corners_found, max_corners) :
@@ -213,42 +93,42 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
         return 0;
 
     if (max_corners > 0 && corners_found > corners_out) {
-        Array<float> harris_responses = createDeviceDataArray<float>(dim4(corners_found), (void*)resp_corners);
+        respCorners.resetDims(dim4(corners_found));
         Array<float> harris_sorted = createEmptyArray<float>(dim4(corners_found));
         Array<unsigned> harris_idx = createEmptyArray<unsigned>(dim4(corners_found));
 
         // Sort Harris responses
-        sort_index<float, false>(harris_sorted, harris_idx, harris_responses, 0);
+        sort_index<float, false>(harris_sorted, harris_idx, respCorners, 0);
 
         x_out = createEmptyArray<float>(dim4(corners_out));
         y_out = createEmptyArray<float>(dim4(corners_out));
         resp_out = createEmptyArray<float>(dim4(corners_out));
 
         // Keep only the corners with higher Harris responses
-        keep_corners(x_out.get(), y_out.get(), resp_out.get(),
-                     x_corners, y_corners, harris_sorted.get(), harris_idx.get(),
-                     corners_out);
-
-        memFree(x_corners);
-        memFree(y_corners);
-    }
-    else if (max_corners == 0 && corners_found < corner_lim) {
+        getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners,
+                           harris_sorted, harris_idx, corners_out);
+    } else if (max_corners == 0 && corners_found < corner_lim) {
         x_out = createEmptyArray<float>(dim4(corners_out));
         y_out = createEmptyArray<float>(dim4(corners_out));
         resp_out = createEmptyArray<float>(dim4(corners_out));
 
-        memcpy(x_out.get(), x_corners, corners_out * sizeof(float));
-        memcpy(y_out.get(), y_corners, corners_out * sizeof(float));
-        memcpy(resp_out.get(), resp_corners, corners_out * sizeof(float));
-
-        memFree(x_corners);
-        memFree(y_corners);
-        memFree(resp_corners);
-    }
-    else {
-        x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
-        y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
-        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
+        auto copyFunc = [=](Array<float> x_out, Array<float> y_out,
+                            Array<float> outResponses, const Array<float> x_crnrs,
+                            const Array<float> y_crnrs, const Array<float> inResponses,
+                            const unsigned corners_out) {
+            memcpy(x_out.get(), x_crnrs.get(), corners_out * sizeof(float));
+            memcpy(y_out.get(), y_crnrs.get(), corners_out * sizeof(float));
+            memcpy(outResponses.get(), inResponses.get(), corners_out * sizeof(float));
+        };
+        getQueue().enqueue(copyFunc, x_out, y_out, resp_out,
+                           xCorners, yCorners, respCorners, corners_out);
+    } else {
+        x_out = xCorners;
+        y_out = yCorners;
+        resp_out = respCorners;
+        x_out.resetDims(dim4(corners_out));
+        y_out.resetDims(dim4(corners_out));
+        resp_out.resetDims(dim4(corners_out));
     }
 
     return corners_out;
diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp
index 21d3fdf941..ad7d69067d 100644
--- a/src/backend/cpu/hist_graphics.cpp
+++ b/src/backend/cpu/hist_graphics.cpp
@@ -11,6 +11,8 @@
 
 #include <hist_graphics.hpp>
 #include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
@@ -18,6 +20,8 @@ namespace cpu
 template<typename T>
 void copy_histogram(const Array<T> &data, const fg::Histogram* hist)
 {
+    data.eval();
+    getQueue().sync();
     CheckGL("Begin copy_histogram");
 
     glBindBuffer(GL_ARRAY_BUFFER, hist->vbo());
diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp
index e382a0ee87..3c30402b47 100644
--- a/src/backend/cpu/histogram.cpp
+++ b/src/backend/cpu/histogram.cpp
@@ -12,6 +12,9 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <histogram.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/histogram.hpp>
 
 using af::dim4;
 
@@ -19,33 +22,19 @@ namespace cpu
 {
 
 template<typename inType, typename outType, bool isLinear>
-Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
+Array<outType> histogram(const Array<inType> &in,
+                         const unsigned &nbins,
+                         const double &minval, const double &maxval) // NOTE(review): reference parameters are forwarded to enqueue below -- confirm enqueue copies its arguments
 {
-    float step = (maxval - minval)/(float)nbins;
+    in.eval(); // the bin-width computation removed above is no longer done in this wrapper
 
     const dim4 inDims  = in.dims();
-    dim4 iStrides      = in.strides();
     dim4 outDims       = dim4(nbins,1,inDims[2],inDims[3]);
     Array<outType> out = createValueArray<outType>(outDims, outType(0));
-    dim4 oStrides      = out.strides();
-    dim_t nElems    = inDims[0]*inDims[1];
+    out.eval(); // out is created as all zeros above; NOTE(review): eval() presumably materializes it before the kernel counts into it -- confirm
 
-    outType *outData    = out.get();
-    const inType* inData= in.get();
-
-    for(dim_t b3 = 0; b3 < outDims[3]; b3++) {
-        for(dim_t b2 = 0; b2 < outDims[2]; b2++) {
-            for(dim_t i=0; i<nElems; i++) {
-                int idx = isLinear ? i : ((i % inDims[0]) + (i / inDims[0])*iStrides[1]);
-                int bin = (int)((inData[idx] - minval) / step);
-                bin = std::max(bin, 0);
-                bin = std::min(bin, (int)(nbins - 1));
-                outData[bin]++;
-            }
-            inData  += iStrides[2];
-            outData += oStrides[2];
-        }
-    }
+    getQueue().enqueue(kernel::histogram<outType, inType, isLinear>, // kernel template order is <outType, inType, ...>, the reverse of this wrapper's
+            out, in, nbins, minval, maxval);
 
     return out;
 }
diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp
index d20f0ca00c..4d131cf695 100644
--- a/src/backend/cpu/homography.cpp
+++ b/src/backend/cpu/homography.cpp
@@ -15,13 +15,11 @@
 #include <handle.hpp>
 #include <homography.hpp>
 #include <arith.hpp>
-#include <ireduce.hpp>
 #include <random.hpp>
-#include <svd.hpp>
-#include <memory.hpp>
 #include <cstring>
-
 #include <cfloat>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
@@ -154,12 +152,9 @@ unsigned updateIterations(float inlier_ratio, unsigned iter)
 }
 
 template<typename T>
-int computeHomography(T* H_ptr,
-                      const float* rnd_ptr,
-                      const float* x_src_ptr,
-                      const float* y_src_ptr,
-                      const float* x_dst_ptr,
-                      const float* y_dst_ptr)
+int computeHomography(T* H_ptr, const float* rnd_ptr,
+                      const float* x_src_ptr, const float* y_src_ptr,
+                      const float* x_dst_ptr, const float* y_dst_ptr)
 {
     if ((unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[1] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[2] ||
         (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[3] || (unsigned)rnd_ptr[1] == (unsigned)rnd_ptr[2] ||
@@ -192,6 +187,8 @@ int computeHomography(T* H_ptr,
     float dst_scale = sqrt(2.0f) / sqrt(dst_var);
 
     Array<T> A = createValueArray<T>(af::dim4(9, 9), (T)0);
+    A.eval();
+    getQueue().sync();
     af::dim4 Adims = A.dims();
     T* A_ptr = A.get();
 
@@ -217,6 +214,8 @@ int computeHomography(T* H_ptr,
     }
 
     Array<T> V = createValueArray<T>(af::dim4(Adims[1], Adims[1]), (T)0);
+    V.eval();
+    getQueue().sync();
     JacobiSVD<T>(A.get(), V.get(), 9, 9);
 
     af::dim4 Vdims = V.dims();
@@ -262,6 +261,8 @@ int findBestHomography(Array<T> &bestH,
     const float* y_dst_ptr = y_dst.get();
 
     Array<T> H = createValueArray<T>(af::dim4(9, iterations), (T)0);
+    H.eval();
+    getQueue().sync();
 
     const af::dim4 rdims = rnd.dims();
     const af::dim4 Hdims = H.dims();
@@ -278,8 +279,7 @@ int findBestHomography(Array<T> &bestH,
         const unsigned ridx = rdims[0] * i;
         const float* rnd_ptr = rnd.get() + ridx;
 
-        if (computeHomography<T>(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr,
-                                 x_dst_ptr, y_dst_ptr))
+        if (computeHomography<T>(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr, x_dst_ptr, y_dst_ptr))
             continue;
 
         if (htype == AF_HOMOGRAPHY_RANSAC) {
@@ -320,7 +320,6 @@ int findBestHomography(Array<T> &bestH,
                 minMedian = median;
                 bestIdx = i;
             }
-
         }
     }
 
@@ -355,6 +354,11 @@ int homography(Array<T> &bestH,
                const float inlier_thr,
                const unsigned iterations)
 {
+    x_src.eval();
+    y_src.eval();
+    x_dst.eval();
+    y_dst.eval();
+
     const af::dim4 idims = x_src.dims();
     const unsigned nsamples = idims[0];
 
@@ -366,6 +370,8 @@ int homography(Array<T> &bestH,
     Array<float> frnd = randu<float>(rdims);
     Array<float> fctr = createValueArray<float>(rdims, (float)nsamples);
     Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
+    rnd.eval();
+    getQueue().sync(); // findBestHomography reads rnd via rnd.get() on the calling thread, so wait here -- assumes sync() blocks until queued work is done
 
     return findBestHomography<T>(bestH, x_src, y_src, x_dst, y_dst, rnd, iter, nsamples, inlier_thr, htype);
 }
diff --git a/src/backend/cpu/hsv_rgb.cpp b/src/backend/cpu/hsv_rgb.cpp
index 82f404fa95..404491766c 100644
--- a/src/backend/cpu/hsv_rgb.cpp
+++ b/src/backend/cpu/hsv_rgb.cpp
@@ -11,8 +11,9 @@
 #include <Array.hpp>
 #include <ArrayInfo.hpp>
 #include <hsv_rgb.hpp>
-#include <err_cpu.hpp>
-#include <cmath>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/hsv_rgb.hpp>
 
 using af::dim4;
 
@@ -22,54 +23,11 @@ namespace cpu
 template<typename T>
 Array<T> hsv2rgb(const Array<T>& in)
 {
-    const dim4 dims    = in.dims();
-    const dim4 strides = in.strides();
-    Array<T> out       = createEmptyArray<T>(dims);
-    dim_t obStride  = out.strides()[3];
-    dim_t coff      = strides[2];
-    dim_t bCount    = dims[3];
+    in.eval(); // NOTE(review): presumably forces any deferred computation of the input before the kernel is queued -- confirm in Array.hpp
 
-    for(dim_t b=0; b<bCount; ++b) {
-        const T* src = in.get() + b * strides[3];
-        T* dst       = out.get() + b * obStride;
+    Array<T> out = createEmptyArray<T>(in.dims()); // output has the same dimensions as the input
 
-        for(dim_t j=0; j<dims[1]; ++j) {
-            dim_t jOff = j*strides[1];
-            // j steps along 2nd dimension
-            for(dim_t i=0; i<dims[0]; ++i) {
-                // i steps along 1st dimension
-                dim_t hIdx = i*strides[0] + jOff;
-                dim_t sIdx = hIdx + coff;
-                dim_t vIdx = sIdx + coff;
-
-                T H = src[hIdx];
-                T S = src[sIdx];
-                T V = src[vIdx];
-
-                T R, G, B;
-                R = G = B = 0;
-
-                int   m = (int)(H * 6);
-                T f = H * 6 - m;
-                T p = V * (1 - S);
-                T q = V * (1 - f * S);
-                T t = V * (1 - (1 - f) * S);
-
-                switch (m % 6) {
-                    case 0: R = V, G = t, B = p; break;
-                    case 1: R = q, G = V, B = p; break;
-                    case 2: R = p, G = V, B = t; break;
-                    case 3: R = p, G = q, B = V; break;
-                    case 4: R = t, G = p, B = V; break;
-                    case 5: R = V, G = p, B = q; break;
-                }
-
-                dst[hIdx] = R;
-                dst[sIdx] = G;
-                dst[vIdx] = B;
-            }
-        }
-    }
+    getQueue().enqueue(kernel::hsv2rgb<T>, out, in); // replaces the inline loops removed above: kernel::hsv2rgb<T> is handed to the queue with out and in as its arguments
 
     return out;
 }
@@ -77,53 +35,11 @@ Array<T> hsv2rgb(const Array<T>& in)
 template<typename T>
 Array<T> rgb2hsv(const Array<T>& in)
 {
-    const dim4 dims    = in.dims();
-    const dim4 strides = in.strides();
-    Array<T> out       = createEmptyArray<T>(dims);
-    dim4 oStrides      = out.strides();
-    dim_t bCount    = dims[3];
-
-    for(dim_t b=0; b<bCount; ++b) {
-        const T* src = in.get() + b * strides[3];
-        T* dst       = out.get() + b * oStrides[3];
-
-        for(dim_t j=0; j<dims[1]; ++j) {
-            // j steps along 2nd dimension
-            dim_t oj = j * oStrides[1];
-            dim_t ij = j * strides[1];
-
-            for(dim_t i=0; i<dims[0]; ++i) {
-                // i steps along 1st dimension
-                dim_t oIdx0 = i * oStrides[0] + oj;
-                dim_t oIdx1 = oIdx0 + oStrides[2];
-                dim_t oIdx2 = oIdx1 + oStrides[2];
-
-                dim_t iIdx0 = i * strides[0]  + ij;
-                dim_t iIdx1 = iIdx0 + strides[2];
-                dim_t iIdx2 = iIdx1 + strides[2];
-
-                T R = src[iIdx0];
-                T G = src[iIdx1];
-                T B = src[iIdx2];
-                T Cmax = std::max(std::max(R, G), B);
-                T Cmin = std::min(std::min(R, G), B);
-                T delta= Cmax-Cmin;
-
-                T H = 0;
+    in.eval(); // NOTE(review): presumably makes sure the input holds real data before the kernel is queued -- confirm in Array.hpp
 
-                if (Cmax!=Cmin) {
-                    if (Cmax==R) H = (G-B)/delta + (G<B ? 6 : 0);
-                    if (Cmax==G) H = (B-R)/delta + 2;
-                    if (Cmax==B) H = (R-G)/delta + 4;
-                    H = H / 6.0f;
-                }
+    Array<T> out = createEmptyArray<T>(in.dims()); // output has the same dimensions as the input
 
-                dst[oIdx0] = H;
-                dst[oIdx1] = (Cmax==0.0f ? 0 : delta/Cmax);
-                dst[oIdx2] = Cmax;
-            }
-        }
-    }
+    getQueue().enqueue(kernel::rgb2hsv<T>, out, in); // the per-pixel loops removed above are no longer run here; kernel::rgb2hsv<T> is queued with out and in
 
     return out;
 }
diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp
index 2973ae4409..c5e11029fc 100644
--- a/src/backend/cpu/identity.cpp
+++ b/src/backend/cpu/identity.cpp
@@ -7,47 +7,40 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/array.h>
 #include <af/dim4.hpp>
-#include <af/defines.h>
 #include <Array.hpp>
 #include <identity.hpp>
-#include <math.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/identity.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> identity(const dim4& dims)
-    {
-        Array<T> out = createEmptyArray<T>(dims);
-        T *ptr = out.get();
-        const dim_t *out_dims  = out.dims().get();
-
-        for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) {
-            for (dim_t j = 0; j < out_dims[1]; j++) {
-                for (dim_t i = 0; i < out_dims[0]; i++) {
-                    ptr[j * out_dims[0] + i]  = (i == j) ? scalar<T>(1) : scalar<T>(0);
-                }
-            }
-            ptr += out_dims[0] * out_dims[1];
-        }
-        return out;
-    }
+
+template<typename T>
+Array<T> identity(const dim4& dims)
+{
+    Array<T> out = createEmptyArray<T>(dims); // uninitialized output of the requested dimensions
+
+    getQueue().enqueue(kernel::identity<T>, out); // the fill loops removed above no longer run here; kernel::identity<T> is queued with out as its only argument
+
+    return out;
+}
 
 #define INSTANTIATE_IDENTITY(T)                              \
     template Array<T>  identity<T>    (const af::dim4 &dims);
 
-    INSTANTIATE_IDENTITY(float)
-    INSTANTIATE_IDENTITY(double)
-    INSTANTIATE_IDENTITY(cfloat)
-    INSTANTIATE_IDENTITY(cdouble)
-    INSTANTIATE_IDENTITY(int)
-    INSTANTIATE_IDENTITY(uint)
-    INSTANTIATE_IDENTITY(intl)
-    INSTANTIATE_IDENTITY(uintl)
-    INSTANTIATE_IDENTITY(char)
-    INSTANTIATE_IDENTITY(uchar)
-    INSTANTIATE_IDENTITY(short)
-    INSTANTIATE_IDENTITY(ushort)
+INSTANTIATE_IDENTITY(float)
+INSTANTIATE_IDENTITY(double)
+INSTANTIATE_IDENTITY(cfloat)
+INSTANTIATE_IDENTITY(cdouble)
+INSTANTIATE_IDENTITY(int)
+INSTANTIATE_IDENTITY(uint)
+INSTANTIATE_IDENTITY(intl)
+INSTANTIATE_IDENTITY(uintl)
+INSTANTIATE_IDENTITY(char)
+INSTANTIATE_IDENTITY(uchar)
+INSTANTIATE_IDENTITY(short)
+INSTANTIATE_IDENTITY(ushort)
 
 }
diff --git a/src/backend/cpu/iir.cpp b/src/backend/cpu/iir.cpp
index 615da2238d..049212ad69 100644
--- a/src/backend/cpu/iir.cpp
+++ b/src/backend/cpu/iir.cpp
@@ -12,81 +12,49 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <iir.hpp>
-#include <err_cpu.hpp>
-#include <math.hpp>
-#include <arith.hpp>
 #include <convolve.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/iir.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> iir(const Array<T> &b, const Array<T> &a, const Array<T> &x)
-    {
-        T h_a0 = a.get()[0];
-        Array<T> a0 = createValueArray<T>(b.dims(), h_a0);
 
-        ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME;
-        if (x.ndims() != b.ndims()) {
-            type = (x.ndims() < b.ndims()) ? CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL;
-        }
-
-        // Extract the first N elements
-        Array<T> c = convolve<T, T, 1, true>(x, b, type);
-        dim4 cdims = c.dims();
-        cdims[0] = x.dims()[0];
-        c.resetDims(cdims);
-
-        int num_a = a.dims()[0];
-
-        dim4 ydims = c.dims();
-        Array<T> y = createEmptyArray<T>(ydims);
-
-        for (int l = 0; l < (int)ydims[3]; l++) {
-            dim_t yidx3 = l * y.strides()[3];
-            dim_t cidx3 = l * c.strides()[3];
-            dim_t aidx3 = l * a.strides()[3];
-
-            for (int k = 0; k < (int)ydims[2]; k++) {
-
-                dim_t yidx2 = k * y.strides()[2] + yidx3;
-                dim_t cidx2 = k * c.strides()[2] + cidx3;
-                dim_t aidx2 = k * a.strides()[2] + aidx3;
-
-                for (int j = 0; j < (int)ydims[1]; j++) {
-
-                    dim_t yidx1 = j * y.strides()[1] + yidx2;
-                    dim_t cidx1 = j * c.strides()[1] + cidx2;
-                    dim_t aidx1 = j * a.strides()[1] + aidx2;
+template<typename T>
+Array<T> iir(const Array<T> &b, const Array<T> &a, const Array<T> &x)
+{
+    b.eval(); // NOTE(review): all three inputs are eval()'d up front; presumably this resolves any deferred computation -- confirm in Array.hpp
+    a.eval();
+    x.eval();
 
-                    std::vector<T> h_z(num_a);
+    ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME; // default batch mode when x and b have the same rank
+    if (x.ndims() != b.ndims()) {
+        type = (x.ndims() < b.ndims()) ? CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL; // ranks differ: pick the batch mode by which operand has more dimensions
+    }
 
-                    const T *h_a = a.get() + (a.ndims() > 1 ? aidx1 : 0);
-                    T *h_c = c.get() + cidx1;
-                    T *h_y = y.get() + yidx1;
+    // Extract the first N elements
+    Array<T> c = convolve<T, T, 1, true>(x, b, type);
+    dim4 cdims = c.dims();
+    cdims[0] = x.dims()[0]; // shrink the first dimension of the convolution result to the length of x
+    c.resetDims(cdims);
 
-                    for (int i = 0; i < (int)ydims[0]; i++) {
+    Array<T> y = createEmptyArray<T>(c.dims()); // output takes the (reset) dimensions of c
 
-                        T y = h_y[i] = (h_c[i] + h_z[0]) /  h_a[0];
-                        for (int ii = 1; ii < num_a; ii++) {
-                            h_z[ii - 1] = h_z[ii] - h_a[ii] * y;
-                        }
-                    }
-                }
-            }
-        }
+    getQueue().enqueue(kernel::iir<T>, y, c, a); // the recursive-filter loops removed above no longer run here; kernel::iir<T> is queued with y, c and a
 
-        return y;
-    }
+    return y;
+}
 
 #define INSTANTIATE(T)                          \
     template Array<T> iir(const Array<T> &b,    \
                           const Array<T> &a,    \
                           const Array<T> &x);   \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+
 }
diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp
index 947afa2351..b71ba23c12 100644
--- a/src/backend/cpu/image.cpp
+++ b/src/backend/cpu/image.cpp
@@ -15,39 +15,43 @@
 #include <Array.hpp>
 #include <image.hpp>
 #include <err_cpu.hpp>
-#include <cstdio>
-#include <stdexcept>
 #include <graphics_common.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    void copy_image(const Array<T> &in, const fg::Image* image)
-    {
-        CheckGL("Before CopyArrayToPBO");
-        const T *d_X = in.get();
-        size_t data_size = image->size();
-
-        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo());
-        glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X);
-        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-        CheckGL("In CopyArrayToPBO");
-    }
-
-    #define INSTANTIATE(T)  \
-        template void copy_image<T>(const Array<T> &in, const fg::Image* image);
-
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short)
+
+template<typename T>
+void copy_image(const Array<T> &in, const fg::Image* image)
+{
+    in.eval();
+    getQueue().sync(); // in.get() is read directly by glBufferSubData below; presumably this waits for queued work on the input first (TODO confirm)
+    CheckGL("Before CopyArrayToPBO");
+    const T *d_X = in.get();
+    size_t data_size = image->size(); // number of bytes uploaded comes from the image, not from the array
+
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo());
+    glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); // unbind so later GL calls do not target this PBO
+
+    CheckGL("In CopyArrayToPBO");
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_image<T>(const Array<T> &in, const fg::Image* image);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
+
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp
index e6d3daba4e..a2cdac888f 100644
--- a/src/backend/cpu/index.cpp
+++ b/src/backend/cpu/index.cpp
@@ -13,98 +13,52 @@
 #include <Array.hpp>
 #include <index.hpp>
 #include <handle.hpp>
-#include <err_cpu.hpp>
 #include <vector>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <utility>
+#include <kernel/index.hpp>
 
+using std::vector;
 using af::dim4;
 
 namespace cpu
 {
 
-static inline
-dim_t trimIndex(dim_t idx, const dim_t &len)
-{
-    dim_t ret_val = idx;
-    dim_t offset  = abs(ret_val)%len;
-    if (ret_val<0) {
-        ret_val = offset-1;
-    } else if (ret_val>=len) {
-        ret_val = len-offset-1;
-    }
-    return ret_val;
-}
-
 template<typename T>
 Array<T> index(const Array<T>& in, const af_index_t idxrs[])
 {
-    bool isSeq[4];
-    std::vector<af_seq> seqs(4, af_span);
+    in.eval(); // NOTE(review): presumably resolves any deferred computation of the input before it is indexed -- confirm in Array.hpp
+
+    vector<bool> isSeq(4);
+    vector<af_seq> seqs(4, af_span);
     // create seq vector to retrieve output
-    // dimensions, offsets & offsets
-    for (dim_t x=0; x<4; ++x) {
+    // dimensions, offsets & strides
+    for (unsigned x=0; x<isSeq.size(); ++x) {
         if (idxrs[x].isSeq) {
             seqs[x] = idxrs[x].idx.seq;
         }
         isSeq[x] = idxrs[x].isSeq;
     }
 
-    // rettrieve
-    dim4 iDims = in.dims();
-    dim4 dDims = in.getDataDims();
-    dim4 oDims = toDims  (seqs, iDims);
-    dim4 iOffs = toOffset(seqs, dDims);
-    dim4 iStrds= toStride(seqs, dDims);
+    // retrieve output dimensions
+    dim4 oDims = toDims(seqs, in.dims());
 
-    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
-    // look through indexs to read af_array indexs
-    for (dim_t x=0; x<4; ++x) {
+    vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
+    // look through the indexers to read af_array indices
+    for (unsigned x=0; x<isSeq.size(); ++x) {
         if (!isSeq[x]) {
             idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
+            idxArrs[x].eval();
             // set output array ith dimension value
             oDims[x] = idxArrs[x].elements();
         }
     }
 
     Array<T> out = createEmptyArray<T>(oDims);
-    dim4 oStrides= out.strides();
-
-    const T *src = in.get();
-    T *dst = out.get();
-
-    const uint* ptr0 = idxArrs[0].get();
-    const uint* ptr1 = idxArrs[1].get();
-    const uint* ptr2 = idxArrs[2].get();
-    const uint* ptr3 = idxArrs[3].get();
-
-    for (dim_t l=0; l<oDims[3]; ++l) {
 
-        dim_t lOff   = l*oStrides[3];
-        dim_t inIdx3 = trimIndex(isSeq[3] ? l+iOffs[3] : ptr3[l], iDims[3]);
-        dim_t inOff3 = inIdx3*iStrds[3];
-
-        for (dim_t k=0; k<oDims[2]; ++k) {
-
-            dim_t kOff   = k*oStrides[2];
-            dim_t inIdx2 = trimIndex(isSeq[2] ? k+iOffs[2] : ptr2[k], iDims[2]);
-            dim_t inOff2 = inIdx2*iStrds[2];
-
-            for (dim_t j=0; j<oDims[1]; ++j) {
-
-                dim_t jOff   = j*oStrides[1];
-                dim_t inIdx1 = trimIndex(isSeq[1] ? j+iOffs[1] : ptr1[j], iDims[1]);
-                dim_t inOff1 = inIdx1*iStrds[1];
-
-                for (dim_t i=0; i<oDims[0]; ++i) {
-
-                    dim_t iOff   = i*oStrides[0];
-                    dim_t inIdx0 = trimIndex(isSeq[0] ? i+iOffs[0] : ptr0[i], iDims[0]);
-                    dim_t inOff0 = inIdx0*iStrds[0];
-
-                    dst[lOff+kOff+jOff+iOff] = src[inOff3+inOff2+inOff1+inOff0];
-                }
-            }
-        }
-    }
+    // isSeq, seqs and idxArrs are not used after this point, so they are moved into the queued task
+    getQueue().enqueue(kernel::index<T>, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs));
 
     return out;
 }
diff --git a/src/backend/cpu/inverse.cpp b/src/backend/cpu/inverse.cpp
index 129823b963..ea7d7ee828 100644
--- a/src/backend/cpu/inverse.cpp
+++ b/src/backend/cpu/inverse.cpp
@@ -23,6 +23,8 @@
 #include <lu.hpp>
 #include <identity.hpp>
 #include <solve.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
@@ -48,6 +50,7 @@ INV_FUNC(getri , cdouble, z)
 template<typename T>
 Array<T> inverse(const Array<T> &in)
 {
+    in.eval(); // NOTE(review): presumably resolves any deferred computation of the input before its dims are read and it is copied -- confirm in Array.hpp
 
     int M = in.dims()[0];
     int N = in.dims()[1];
@@ -58,12 +61,16 @@ Array<T> inverse(const Array<T> &in)
     }
 
     Array<T> A = copyArray<T>(in);
-
     Array<int> pivot = lu_inplace<T>(A, false);
 
-    getri_func<T>()(AF_LAPACK_COL_MAJOR, M,
-                    A.get(), A.strides()[1],
-                    pivot.get());
+    // Everything the task needs is passed as an argument, so nothing is captured;
+    // the parameters are named differently from the locals A, pivot and M they receive.
+    auto func = [] (Array<T> lu, Array<int> ipiv, int n) {
+        getri_func<T>()(AF_LAPACK_COL_MAJOR, n,
+                        lu.get(), lu.strides()[1],
+                        ipiv.get());
+    };
+    getQueue().enqueue(func, A, pivot, M);
 
     return A;
 }
diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp
index 47bcb924e4..db19708b46 100644
--- a/src/backend/cpu/iota.cpp
+++ b/src/backend/cpu/iota.cpp
@@ -10,63 +10,38 @@
 #include <Array.hpp>
 #include <iota.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <algorithm>
-#include <numeric>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/iota.hpp>
 
 using namespace std;
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T>
-    void iota(T *out, const dim4 &dims, const dim4 &strides, const dim4 &sdims, const dim4 &tdims)
-    {
-        for(dim_t w = 0; w < dims[3]; w++) {
-            dim_t offW = w * strides[3];
-            T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2];
-            for(dim_t z = 0; z < dims[2]; z++) {
-                dim_t offWZ = offW + z * strides[2];
-                T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1];
-                for(dim_t y = 0; y < dims[1]; y++) {
-                    dim_t offWZY = offWZ + y * strides[1];
-                    T valY = valZ + (y % sdims[1]) * sdims[0];
-                    for(dim_t x = 0; x < dims[0]; x++) {
-                        dim_t id = offWZY + x;
-                        out[id] = valY + (x % sdims[0]);
-                    }
-                }
-            }
-        }
-    }
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T>
-    Array<T> iota(const dim4 &dims, const dim4 &tile_dims)
-    {
-        dim4 outdims = dims * tile_dims;
+template<typename T>
+Array<T> iota(const dim4 &dims, const dim4 &tile_dims)
+{
+    dim4 outdims = dims * tile_dims; // output size is the product of the sequence dims and the tile counts
+
+    Array<T> out = createEmptyArray<T>(outdims);
 
-        Array<T> out = createEmptyArray<T>(outdims);
-        iota<T>(out.get(), out.dims(), out.strides(), dims, tile_dims);
+    getQueue().enqueue(kernel::iota<T>, out, dims, tile_dims); // replaces the direct iota<T>(...) call removed above; kernel::iota<T> is queued with out, dims and tile_dims
 
-        return out;
-    }
+    return out;
+}
 
 #define INSTANTIATE(T)                                                          \
     template Array<T> iota<T>(const af::dim4 &dims, const af::dim4 &tile_dims); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index 2928af9620..a40fbdf958 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -13,192 +13,99 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <ireduce.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/ireduce.hpp>
+#include <functional>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T> double cabs(const T in) { return (double)in; }
-    static double cabs(const char in) { return (double)(in > 0); }
-    static double cabs(const cfloat &in) { return (double)abs(in); }
-    static double cabs(const cdouble &in) { return (double)abs(in); }
-
-    template<af_op_t op, typename T>
-    struct MinMaxOp
-    {
-        T m_val;
-        uint m_idx;
-        MinMaxOp(T val, uint idx) :
-            m_val(val), m_idx(idx)
-        {
-        }
 
-        void operator()(T val, uint idx)
-        {
-            if (cabs(val) < cabs(m_val) ||
-                (cabs(val) == cabs(m_val) &&
-                 idx > m_idx)) {
-                m_val = val;
-                m_idx = idx;
-            }
-        }
-    };
-
-    template<typename T>
-    struct MinMaxOp<af_max_t, T>
-    {
-        T m_val;
-        uint m_idx;
-        MinMaxOp(T val, uint idx) :
-            m_val(val), m_idx(idx)
-        {
-        }
+template<af_op_t op, typename T> // type-erased call signature shared by the kernel::ireduce_dim<op, T, D> functors stored in the table inside ireduce()
+using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
+                                            const Array<T>, const dim_t, const int)>;
 
-        void operator()(T val, uint idx)
-        {
-            if (cabs(val) > cabs(m_val) ||
-                (cabs(val) == cabs(m_val) &&
-                 idx <= m_idx)) {
-                m_val = val;
-                m_idx = idx;
-            }
-        }
-    };
-
-    template<af_op_t op, typename T, int D>
-    struct ireduce_dim
-    {
-        void operator()(T *out, const dim4 ostrides, const dim4 odims,
-                        uint *loc,
-                        const T *in , const dim4 istrides, const dim4 idims,
-                        const int dim)
-        {
-            const int D1 = D - 1;
-            for (dim_t i = 0; i < odims[D1]; i++) {
-                ireduce_dim<op, T, D1>()(out + i * ostrides[D1],
-                                         ostrides, odims,
-                                         loc + i * ostrides[D1],
-                                         in  + i * istrides[D1],
-                                         istrides, idims,
-                                         dim);
-            }
-        }
-    };
-
-    template<af_op_t op, typename T>
-    struct ireduce_dim<op, T, 0>
-    {
-        void operator()(T *out, const dim4 ostrides, const dim4 odims,
-                        uint *loc,
-                        const T *in , const dim4 istrides, const dim4 idims,
-                        const int dim)
-        {
-
-            dim_t stride = istrides[dim];
-            MinMaxOp<op, T> Op(in[0], 0);
-            for (dim_t i = 0; i < idims[dim]; i++) {
-                Op(in[i * stride], i);
-            }
+template<af_op_t op, typename T>
+void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
+{
+    out.eval(); // NOTE(review): presumably resolves any deferred computation on the three arrays before the kernel is queued -- confirm in Array.hpp
+    loc.eval();
+    in.eval();
+
+    static const ireduce_dim_func<op, T> ireduce_funcs[] = { kernel::ireduce_dim<op, T, 1>() // entry i handles (i + 1)-dimensional input, matching the ndims() - 1 lookup below
+                                                           , kernel::ireduce_dim<op, T, 2>()
+                                                           , kernel::ireduce_dim<op, T, 3>()
+                                                           , kernel::ireduce_dim<op, T, 4>()};
+
+    getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); // the two 0s bind to the dim_t parameters of ireduce_dim_func; presumably starting offsets -- confirm in kernel/ireduce.hpp
+}
 
-            *out = Op.m_val;
-            *loc = Op.m_idx;
-        }
-    };
-
-    template<af_op_t op, typename T>
-    void ireduce(Array<T> &out, Array<uint> &loc,
-                 const Array<T> &in, const int dim)
-    {
-        dim4 odims = in.dims();
-        odims[dim] = 1;
-
-        switch (in.ndims()) {
-        case 1:
-            ireduce_dim<op, T, 1>()(out.get(), out.strides(), out.dims(),
-                                    loc.get(),
-                                    in.get(), in.strides(), in.dims(), dim);
-            break;
-
-        case 2:
-            ireduce_dim<op, T, 2>()(out.get(), out.strides(), out.dims(),
-                                    loc.get(),
-                                    in.get(), in.strides(), in.dims(), dim);
-            break;
-
-        case 3:
-            ireduce_dim<op, T, 3>()(out.get(), out.strides(), out.dims(),
-                                    loc.get(),
-                                    in.get(), in.strides(), in.dims(), dim);
-            break;
-
-        case 4:
-            ireduce_dim<op, T, 4>()(out.get(), out.strides(), out.dims(),
-                                    loc.get(),
-                                    in.get(), in.strides(), in.dims(), dim);
-            break;
-        }
-    }
+template<af_op_t op, typename T>
+T ireduce_all(unsigned *loc, const Array<T> &in)
+{
+    in.eval();
+    getQueue().sync(); // in.get() is read directly by the loops below; presumably this waits for queued work on the input first (TODO confirm)
 
-    template<af_op_t op, typename T>
-    T ireduce_all(unsigned *loc, const Array<T> &in)
-    {
-        af::dim4 dims = in.dims();
-        af::dim4 strides = in.strides();
-        const T *inPtr = in.get();
+    af::dim4 dims = in.dims();
+    af::dim4 strides = in.strides();
+    const T *inPtr = in.get();
 
-        MinMaxOp<op, T> Op(inPtr[0], 0);
+    kernel::MinMaxOp<op, T> Op(inPtr[0], 0); // seed the running result with the first element at index 0
 
-        for(dim_t l = 0; l < dims[3]; l++) {
-            dim_t off3 = l * strides[3];
+    for(dim_t l = 0; l < dims[3]; l++) {
+        dim_t off3 = l * strides[3];
 
-            for(dim_t k = 0; k < dims[2]; k++) {
-                dim_t off2 = k * strides[2];
+        for(dim_t k = 0; k < dims[2]; k++) {
+            dim_t off2 = k * strides[2];
 
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    dim_t off1 = j * strides[1];
+            for(dim_t j = 0; j < dims[1]; j++) {
+                dim_t off1 = j * strides[1];
 
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        dim_t idx = i + off1 + off2 + off3;
-                        Op(inPtr[idx], idx);
-                    }
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    dim_t idx = i + off1 + off2 + off3; // linear offset into the buffer; the same value is reported as the location
+                    Op(inPtr[idx], idx);
                 }
             }
         }
-
-        *loc = Op.m_idx;
-        return Op.m_val;
     }
 
+    *loc = Op.m_idx;
+    return Op.m_val;
+}
+
 #define INSTANTIATE(ROp, T)                                             \
     template void ireduce<ROp, T>(Array<T> &out, Array<uint> &loc,      \
                                   const Array<T> &in, const int dim);   \
     template T ireduce_all<ROp, T>(unsigned *loc, const Array<T> &in);  \
 
-    //min
-    INSTANTIATE(af_min_t, float  )
-    INSTANTIATE(af_min_t, double )
-    INSTANTIATE(af_min_t, cfloat )
-    INSTANTIATE(af_min_t, cdouble)
-    INSTANTIATE(af_min_t, int    )
-    INSTANTIATE(af_min_t, uint   )
-    INSTANTIATE(af_min_t, intl   )
-    INSTANTIATE(af_min_t, uintl  )
-    INSTANTIATE(af_min_t, char   )
-    INSTANTIATE(af_min_t, uchar  )
-    INSTANTIATE(af_min_t, short  )
-    INSTANTIATE(af_min_t, ushort )
-
-    //max
-    INSTANTIATE(af_max_t, float  )
-    INSTANTIATE(af_max_t, double )
-    INSTANTIATE(af_max_t, cfloat )
-    INSTANTIATE(af_max_t, cdouble)
-    INSTANTIATE(af_max_t, int    )
-    INSTANTIATE(af_max_t, uint   )
-    INSTANTIATE(af_max_t, intl   )
-    INSTANTIATE(af_max_t, uintl  )
-    INSTANTIATE(af_max_t, char   )
-    INSTANTIATE(af_max_t, uchar  )
-    INSTANTIATE(af_max_t, short  )
-    INSTANTIATE(af_max_t, ushort )
+//min
+INSTANTIATE(af_min_t, float  )
+INSTANTIATE(af_min_t, double )
+INSTANTIATE(af_min_t, cfloat )
+INSTANTIATE(af_min_t, cdouble)
+INSTANTIATE(af_min_t, int    )
+INSTANTIATE(af_min_t, uint   )
+INSTANTIATE(af_min_t, intl   )
+INSTANTIATE(af_min_t, uintl  )
+INSTANTIATE(af_min_t, char   )
+INSTANTIATE(af_min_t, uchar  )
+INSTANTIATE(af_min_t, short  )
+INSTANTIATE(af_min_t, ushort )
+
+//max
+INSTANTIATE(af_max_t, float  )
+INSTANTIATE(af_max_t, double )
+INSTANTIATE(af_max_t, cfloat )
+INSTANTIATE(af_max_t, cdouble)
+INSTANTIATE(af_max_t, int    )
+INSTANTIATE(af_max_t, uint   )
+INSTANTIATE(af_max_t, intl   )
+INSTANTIATE(af_max_t, uintl  )
+INSTANTIATE(af_max_t, char   )
+INSTANTIATE(af_max_t, uchar  )
+INSTANTIATE(af_max_t, short  )
+INSTANTIATE(af_max_t, ushort )
+
 }
diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp
index 78d2a51ab4..0a5b99cd13 100644
--- a/src/backend/cpu/join.cpp
+++ b/src/backend/cpu/join.cpp
@@ -9,243 +9,136 @@
 
 #include <Array.hpp>
 #include <join.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/join.hpp>
 
 namespace cpu
 {
-    template<typename To, typename Tx, int dim>
-    void join_append(To *out, const Tx *X, const af::dim4 &offset,
-               const af::dim4 &odims, const af::dim4 &xdims,
-               const af::dim4 &ost, const af::dim4 &xst)
-    {
-        for(dim_t ow = 0; ow < xdims[3]; ow++) {
-            const dim_t xW = ow * xst[3];
-            const dim_t oW = (ow + offset[3]) * ost[3];
-
-            for(dim_t oz = 0; oz < xdims[2]; oz++) {
-                const dim_t xZW = xW + oz * xst[2];
-                const dim_t oZW = oW + (oz + offset[2]) * ost[2];
-
-                for(dim_t oy = 0; oy < xdims[1]; oy++) {
-                    const dim_t xYZW = xZW + oy * xst[1];
-                    const dim_t oYZW = oZW + (oy + offset[1]) * ost[1];
-
-                    for(dim_t ox = 0; ox < xdims[0]; ox++) {
-                        const dim_t iMem = xYZW + ox;
-                        const dim_t oMem = oYZW + (ox + offset[0]);
-                        out[oMem] = X[iMem];
-                    }
-                }
-            }
+// Joins `first` and `second` along dimension `dim`; the copy is enqueued on getQueue() as kernel::join.
+template<typename Tx, typename Ty>
+Array<Tx> join(const int dim, const Array<Tx> &first, const Array<Ty> &second)
+{
+    first.eval();  // eval() both inputs up front (presumably materializes pending lazy work -- confirm)
+    second.eval();
+
+    // Precondition (not checked here): every dimension except `dim` is equal.
+    // Output dims: extents add along `dim`; all other extents come from `first`.
+    af::dim4 odims;
+    af::dim4 fdims = first.dims();
+    af::dim4 sdims = second.dims();
+
+    for(int i = 0; i < 4; i++) {
+        if(i == dim) {
+            odims[i] = fdims[i] + sdims[i];
+        } else {
+            odims[i] = fdims[i];
         }
     }
 
-    template<int dim>
-    af::dim4 calcOffset(const af::dim4 dims)
-    {
-        af::dim4 offset;
-        offset[0] = (dim == 0) ? dims[0] : 0;
-        offset[1] = (dim == 1) ? dims[1] : 0;
-        offset[2] = (dim == 2) ? dims[2] : 0;
-        offset[3] = (dim == 3) ? dims[3] : 0;
-        return offset;
-    }
+    Array<Tx> out = createEmptyArray<Tx>(odims);
 
-    template<typename Tx, typename Ty>
-    Array<Tx> join(const int dim, const Array<Tx> &first, const Array<Ty> &second)
-    {
-        // All dimensions except join dimension must be equal
-        // Compute output dims
-        af::dim4 odims;
-        af::dim4 fdims = first.dims();
-        af::dim4 sdims = second.dims();
-
-        for(int i = 0; i < 4; i++) {
-            if(i == dim) {
-                odims[i] = fdims[i] + sdims[i];
-            } else {
-                odims[i] = fdims[i];
-            }
-        }
+    getQueue().enqueue(kernel::join<Tx, Ty>, out, dim, first, second);
 
-        Array<Tx> out = createEmptyArray<Tx>(odims);
-
-        Tx* outPtr = out.get();
-        const Tx* fptr = first.get();
-        const Ty* sptr = second.get();
-
-        af::dim4 zero(0,0,0,0);
-
-        switch(dim) {
-            case 0:
-                join_append<Tx, Tx, 0>(outPtr, fptr, zero,
-                                       odims, fdims, out.strides(), first.strides());
-                join_append<Tx, Ty, 0>(outPtr, sptr, calcOffset<0>(fdims),
-                                       odims, sdims, out.strides(), second.strides());
-                break;
-            case 1:
-                join_append<Tx, Tx, 1>(outPtr, fptr, zero,
-                                       odims, fdims, out.strides(), first.strides());
-                join_append<Tx, Ty, 1>(outPtr, sptr, calcOffset<1>(fdims),
-                                       odims, sdims, out.strides(), second.strides());
-                break;
-            case 2:
-                join_append<Tx, Tx, 2>(outPtr, fptr, zero,
-                                       odims, fdims, out.strides(), first.strides());
-                join_append<Tx, Ty, 2>(outPtr, sptr, calcOffset<2>(fdims),
-                                       odims, sdims, out.strides(), second.strides());
-                break;
-            case 3:
-                join_append<Tx, Tx, 3>(outPtr, fptr, zero,
-                                       odims, fdims, out.strides(), first.strides());
-                join_append<Tx, Ty, 3>(outPtr, sptr, calcOffset<3>(fdims),
-                                       odims, sdims, out.strides(), second.strides());
-                break;
-        }
+    return out;
+}
 
-        return out;
+template<typename T> // Joins every array in `inputs` along `dim`; a kernel is dispatched for 1 to 10 inputs only.
+Array<T> join(const int dim, const std::vector<Array<T>> &inputs)
+{
+    for (unsigned i=0; i<inputs.size(); ++i)
+        inputs[i].eval(); // eval() each input first (presumably materializes pending lazy work -- confirm)
+    // Precondition (not checked here): every dimension except `dim` is equal across inputs.
+    // Output dims: extents are summed along `dim`; all other extents come from inputs[0].
+    af::dim4 odims;
+    const dim_t n_arrays = inputs.size();
+    std::vector<af::dim4> idims(n_arrays);
+
+    dim_t dim_size = 0; // running total of the inputs' extents along `dim`
+    for(unsigned i = 0; i < idims.size(); i++) {
+        idims[i] = inputs[i].dims();
+        dim_size += idims[i][dim];
     }
 
-    template<typename T, int n_arrays>
-    void join_wrapper(const int dim, Array<T> &out, const std::vector<Array<T>> &inputs)
-    {
-        af::dim4 zero(0,0,0,0);
-        af::dim4 d = zero;
-        switch(dim) {
-            case 0:
-                join_append<T, T, 0>(out.get(), inputs[0].get(), zero,
-                            out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
-                for(int i = 1; i < n_arrays; i++) {
-                    d += inputs[i - 1].dims();
-                    join_append<T, T, 0>(out.get(), inputs[i].get(), calcOffset<0>(d),
-                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
-                }
-                break;
-            case 1:
-                join_append<T, T, 1>(out.get(), inputs[0].get(), zero,
-                            out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
-                for(int i = 1; i < n_arrays; i++) {
-                    d += inputs[i - 1].dims();
-                    join_append<T, T, 1>(out.get(), inputs[i].get(), calcOffset<1>(d),
-                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
-                }
-                break;
-            case 2:
-                join_append<T, T, 2>(out.get(), inputs[0].get(), zero,
-                            out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
-                for(int i = 1; i < n_arrays; i++) {
-                    d += inputs[i - 1].dims();
-                    join_append<T, T, 2>(out.get(), inputs[i].get(), calcOffset<2>(d),
-                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
-                }
-                break;
-            case 3:
-                join_append<T, T, 3>(out.get(), inputs[0].get(), zero,
-                            out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
-                for(int i = 1; i < n_arrays; i++) {
-                    d += inputs[i - 1].dims();
-                    join_append<T, T, 3>(out.get(), inputs[i].get(), calcOffset<3>(d),
-                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
-                }
-                break;
+    for(int i = 0; i < 4; i++) {
+        if(i == dim) {
+            odims[i] = dim_size;
+        } else {
+            odims[i] = idims[0][i]; // NOTE(review): reads idims[0]; an empty `inputs` is not guarded against
         }
     }
 
-    template<typename T>
-    Array<T> join(const int dim, const std::vector<Array<T>> &inputs)
-    {
-        // All dimensions except join dimension must be equal
-        // Compute output dims
-        af::dim4 odims;
-        const dim_t n_arrays = inputs.size();
-        std::vector<af::dim4> idims(n_arrays);
-
-        dim_t dim_size = 0;
-        for(int i = 0; i < (int)idims.size(); i++) {
-            idims[i] = inputs[i].dims();
-            dim_size += idims[i][dim];
-        }
-
-        for(int i = 0; i < 4; i++) {
-            if(i == dim) {
-                odims[i] = dim_size;
-            } else {
-                odims[i] = idims[0][i];
-            }
-        }
-
-        Array<T> out = createEmptyArray<T>(odims);
-
-        switch(n_arrays) {
-            case 1:
-                join_wrapper<T, 1>(dim, out, inputs);
-                break;
-            case 2:
-                join_wrapper<T, 2>(dim, out, inputs);
-                break;
-            case 3:
-                join_wrapper<T, 3>(dim, out, inputs);
-                break;
-            case 4:
-                join_wrapper<T, 4>(dim, out, inputs);
-                break;
-            case 5:
-                join_wrapper<T, 5>(dim, out, inputs);
-                break;
-            case 6:
-                join_wrapper<T, 6>(dim, out, inputs);
-                break;
-            case 7:
-                join_wrapper<T, 7>(dim, out, inputs);
-                break;
-            case 8:
-                join_wrapper<T, 8>(dim, out, inputs);
-                break;
-            case 9:
-                join_wrapper<T, 9>(dim, out, inputs);
-                break;
-            case 10:
-                join_wrapper<T,10>(dim, out, inputs);
-                break;
-        }
-
-        return out;
+    Array<T> out = createEmptyArray<T>(odims);
+    // The array count is a template argument of kernel::join, so the runtime count is mapped case by case.
+    switch(n_arrays) { // NOTE(review): no default case -- counts outside 1..10 enqueue nothing
+        case 1:
+            getQueue().enqueue(kernel::join<T, 1>, dim, out, inputs);
+            break;
+        case 2:
+            getQueue().enqueue(kernel::join<T, 2>, dim, out, inputs);
+            break;
+        case 3:
+            getQueue().enqueue(kernel::join<T, 3>, dim, out, inputs);
+            break;
+        case 4:
+            getQueue().enqueue(kernel::join<T, 4>, dim, out, inputs);
+            break;
+        case 5:
+            getQueue().enqueue(kernel::join<T, 5>, dim, out, inputs);
+            break;
+        case 6:
+            getQueue().enqueue(kernel::join<T, 6>, dim, out, inputs);
+            break;
+        case 7:
+            getQueue().enqueue(kernel::join<T, 7>, dim, out, inputs);
+            break;
+        case 8:
+            getQueue().enqueue(kernel::join<T, 8>, dim, out, inputs);
+            break;
+        case 9:
+            getQueue().enqueue(kernel::join<T, 9>, dim, out, inputs);
+            break;
+        case 10:
+            getQueue().enqueue(kernel::join<T,10>, dim, out, inputs);
+            break;
     }
 
+    return out;
+}
+
 #define INSTANTIATE(Tx, Ty) \
     template Array<Tx> join<Tx, Ty>(const int dim, const Array<Tx> &first, const Array<Ty> &second);
 
-    INSTANTIATE(float,   float)
-    INSTANTIATE(double,  double)
-    INSTANTIATE(cfloat,  cfloat)
-    INSTANTIATE(cdouble, cdouble)
-    INSTANTIATE(int,     int)
-    INSTANTIATE(uint,    uint)
-    INSTANTIATE(intl,    intl)
-    INSTANTIATE(uintl,   uintl)
-    INSTANTIATE(uchar,   uchar)
-    INSTANTIATE(char,    char)
-    INSTANTIATE(ushort,  ushort)
-    INSTANTIATE(short,   short)
+INSTANTIATE(float,   float)   // two-array join: only same-type (Tx == Ty) pairs are instantiated
+INSTANTIATE(double,  double)
+INSTANTIATE(cfloat,  cfloat)
+INSTANTIATE(cdouble, cdouble)
+INSTANTIATE(int,     int)
+INSTANTIATE(uint,    uint)
+INSTANTIATE(intl,    intl)
+INSTANTIATE(uintl,   uintl)
+INSTANTIATE(uchar,   uchar)
+INSTANTIATE(char,    char)
+INSTANTIATE(ushort,  ushort)
+INSTANTIATE(short,   short)
 
 #undef INSTANTIATE
 
 #define INSTANTIATE(T)      \
     template Array<T> join<T>(const int dim, const std::vector<Array<T>> &inputs);
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short)
+INSTANTIATE(float)   // vector-of-arrays join: one instantiation for each of twelve element types
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
 
 #undef INSTANTIATE
 }
diff --git a/src/backend/cpu/kernel/Array.hpp b/src/backend/cpu/kernel/Array.hpp
new file mode 100644
index 0000000000..08ade502e5
--- /dev/null
+++ b/src/backend/cpu/kernel/Array.hpp
@@ -0,0 +1,58 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <platform.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Fills the data buffer of `in` by calling in.node->calc() once per element and storing each result.
+template<typename T>
+void evalArray(Array<T> in)
+{
+    in.setId(cpu::getActiveDeviceId()); // tag the array with the currently active device id
+    T *ptr = in.data.get();             // destination buffer for the computed values
+
+    af::dim4 odims = in.dims();
+    af::dim4 ostrs = in.strides();
+
+    bool is_linear = in.node->isLinear(odims.get()); // selects flat calc(i) vs. 4-D calc(x, y, z, w) below
+
+    if (is_linear) {
+        int num = in.elements(); // NOTE(review): the element count is narrowed to int here
+        for (int i = 0; i < num; i++) {
+            ptr[i] = *(T *)in.node->calc(i);
+        }
+    } else {
+        for (int w = 0; w < (int)odims[3]; w++) {
+            dim_t offw = w * ostrs[3];
+
+            for (int z = 0; z < (int)odims[2]; z++) {
+                dim_t offz = z * ostrs[2] + offw;
+
+                for (int y = 0; y < (int)odims[1]; y++) {
+                    dim_t offy = y * ostrs[1] + offz;
+
+                    for (int x = 0; x < (int)odims[0]; x++) {
+                        dim_t id = x + offy; // dim 0 is written with unit stride (ostrs[0] is not applied)
+
+                        ptr[id] = *(T *)in.node->calc(x, y, z, w);
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp
new file mode 100644
index 0000000000..ab12ebc813
--- /dev/null
+++ b/src/backend/cpu/kernel/approx1.hpp
@@ -0,0 +1,144 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Primary template: an interpolation Method with no specialization is a no-op (nothing is written to out).
+template<typename InT, typename LocT, af_interp_type Method>
+struct approx1_op
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        return;
+    }
+};
+// AF_INTERP_NEAREST: writes the single output element at (idx, idy, idz, idw) from the nearest input sample.
+template<typename InT, typename LocT>
+struct approx1_op<InT, LocT, AF_INTERP_NEAREST>
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        dim_t pmId = idx; // index into pos; the higher-dimension offsets are added only when pBatch is set
+        if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
+
+        LocT const x = pos[pmId];
+        bool gFlag = false;
+        if (x < 0 || idims[0] < x+1) {  // off grid when x is outside [0, idims[0]-1]; no need to check y
+            gFlag = true;
+        }
+
+        dim_t const omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<InT>(offGrid); // off-grid positions receive the offGrid fill value
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                       + idy * istrides[1];
+            dim_t const iMem = round(x) + ioff; // nearest sample along dim 0; istrides[0] is not applied
+
+            out[omId] = in[iMem];
+        }
+    }
+};
+// AF_INTERP_LINEAR: writes one output element as the weighted blend of in[floor(x)] and in[floor(x) + 1].
+template<typename InT, typename LocT>
+struct approx1_op<InT, LocT, AF_INTERP_LINEAR>
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        dim_t pmId = idx; // index into pos; the higher-dimension offsets are added only when pBatch is set
+        if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
+
+        LocT const x = pos[pmId];
+        bool gFlag = false;
+        if (x < 0 || idims[0] < x+1) { // off grid when x is outside [0, idims[0]-1]
+            gFlag = true;
+        }
+
+        dim_t const grid_x = floor(x);  // grid point at or below x
+        LocT const off_x = x - grid_x; // fractional offset
+
+        dim_t const omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<InT>(offGrid); // off-grid positions receive the offGrid fill value
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
+
+            // Check if x and x + 1 are both valid indices
+            bool cond = (x < idims[0] - 1);
+            // Compute Left and Right Weighted Values
+            InT yl = ((LocT)1.0 - off_x) * in[ioff];
+            InT yr = cond ? (off_x) * in[ioff + 1] : scalar<InT>(0);
+            InT yo = yl + yr;
+            // Normalizing weight: 1 when both neighbours contribute, otherwise the left weight alone
+            LocT wt = cond ? (LocT)1.0 : (LocT)(1.0 - off_x);
+            // Write final value
+            out[omId] = (yo / wt);
+        }
+    }
+};
+// Applies approx1_op<InT, LocT, Method> to every element of `output`, sampling `input` at `position`.
+template<typename InT, typename LocT, af_interp_type Method>
+void approx1(Array<InT> output, Array<InT> const input,
+             Array<LocT> const position, float const offGrid)
+{
+    InT * out = output.get();
+    InT const * const in  = input.get();
+    LocT const * const pos = position.get();
+
+    af::dim4 const odims     = output.dims();
+    af::dim4 const idims     = input.dims();
+    af::dim4 const pdims     = position.dims();
+    af::dim4 const ostrides  = output.strides();
+    af::dim4 const istrides  = input.strides();
+    af::dim4 const pstrides  = position.strides();
+
+    dim_t const oElems = output.elements();
+    dim_t const iElems = input.elements();
+
+    approx1_op<InT, LocT, Method> op;
+    bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); // true when position extends beyond dim 0
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+            for(dim_t y = 0; y < odims[1]; y++) {
+                for(dim_t x = 0; x < odims[0]; x++) {
+                    op(out, odims, oElems, in, idims, iElems, pos, pdims,
+                       ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/approx2.hpp b/src/backend/cpu/kernel/approx2.hpp
new file mode 100644
index 0000000000..b5115e2e49
--- /dev/null
+++ b/src/backend/cpu/kernel/approx2.hpp
@@ -0,0 +1,170 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Primary template: an interpolation Method with no specialization is a no-op (nothing is written to out).
+template<typename InT, typename LocT, af_interp_type Method>
+struct approx2_op
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides,
+              af::dim4 const & pstrides, af::dim4 const & qstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        return;
+    }
+};
+// AF_INTERP_NEAREST: writes the single output element at (idx, idy, idz, idw) from the nearest (x, y) sample.
+template<typename InT, typename LocT>
+struct approx2_op<InT, LocT, AF_INTERP_NEAREST>
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides,
+              af::dim4 const & pstrides, af::dim4 const & qstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        dim_t pmId = idy * pstrides[1] + idx; // pos holds the dim-0 coordinate (x) ...
+        dim_t qmId = idy * qstrides[1] + idx; // ... and qos the dim-1 coordinate (y), as used below
+        if(pBatch) {
+            pmId += idw * pstrides[3] + idz * pstrides[2];
+            qmId += idw * qstrides[3] + idz * qstrides[2];
+        }
+
+        bool gFlag = false;
+        LocT const x = pos[pmId], y = qos[qmId];
+        if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { // off grid in either axis
+            gFlag = true;
+        }
+
+        dim_t const omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<InT>(offGrid); // off-grid positions receive the offGrid fill value
+        } else {
+            dim_t const grid_x = round(x), grid_y = round(y); // nearest grid
+            dim_t const imId = idw * istrides[3] + idz * istrides[2] +
+                            grid_y * istrides[1] + grid_x;
+            out[omId] = in[imId];
+        }
+    }
+};
+// AF_INTERP_LINEAR: writes one output element as the normalized weighted blend of up to four neighbours.
+template<typename InT, typename LocT>
+struct approx2_op<InT, LocT, AF_INTERP_LINEAR>
+{
+    void operator()(InT *out, af::dim4 const & odims, dim_t const oElems,
+              InT const * const in,  af::dim4 const & idims, dim_t const iElems,
+              LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims,
+              af::dim4 const & ostrides, af::dim4 const & istrides,
+              af::dim4 const & pstrides, af::dim4 const & qstrides,
+              float const offGrid, bool const pBatch,
+              dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw)
+    {
+        dim_t pmId = idy * pstrides[1] + idx; // pos holds the dim-0 coordinate (x) ...
+        dim_t qmId = idy * qstrides[1] + idx; // ... and qos the dim-1 coordinate (y), as used below
+        if(pBatch) {
+            pmId += idw * pstrides[3] + idz * pstrides[2];
+            qmId += idw * qstrides[3] + idz * qstrides[2];
+        }
+
+        bool gFlag = false;
+        LocT const x = pos[pmId], y = qos[qmId];
+        if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { // off grid in either axis
+            gFlag = true;
+        }
+
+        dim_t const grid_x = floor(x),   grid_y = floor(y);   // grid point at or below (x, y)
+        LocT const off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
+
+        // Check whether the +1 neighbour along each axis is still inside the input
+        bool condY = (y < idims[1] - 1);
+        bool condX = (x < idims[0] - 1);
+
+        // Compute weights used (a neighbour outside the input gets weight 0)
+        LocT wt00 = ((LocT)1.0 - off_x) * ((LocT)1.0 - off_y);
+        LocT wt10 = (condY) ? ((LocT)1.0 - off_x) * (off_y) : 0;
+        LocT wt01 = (condX) ? (off_x) * ((LocT)1.0 - off_y) : 0;
+        LocT wt11 = (condX && condY) ? (off_x) * (off_y)  : 0;
+
+        LocT wt = wt00 + wt10 + wt01 + wt11; // normalizer applied to the blended value
+        InT zero = scalar<InT>(0);
+
+        dim_t const omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<InT>(offGrid); // off-grid positions receive the offGrid fill value
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                    + grid_y * istrides[1] + grid_x;
+
+            // Compute Weighted Values
+            InT y00 =                    wt00 * in[ioff];
+            InT y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
+            InT y01 = (condX) ?          wt01 * in[ioff + 1]               : zero;
+            InT y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
+
+            InT yo = y00 + y10 + y01 + y11;
+
+            // Write Final Value
+            out[omId] = (yo / wt);
+        }
+    }
+};
+// Applies approx2_op<InT, LocT, Method> to every element of `output`, sampling `input` at (position, qosition).
+template<typename InT, typename LocT, af_interp_type Method>
+void approx2(Array<InT> output, Array<InT> const input,
+             Array<LocT> const position, Array<LocT> const qosition,
+             float const offGrid)
+{
+    InT * out = output.get();
+    InT const * const in  = input.get();
+    LocT const * const pos = position.get();
+    LocT const * const qos = qosition.get();
+    af::dim4 const odims     = output.dims();
+    af::dim4 const idims     = input.dims();
+    af::dim4 const pdims     = position.dims();
+    af::dim4 const qdims     = qosition.dims();
+    af::dim4 const ostrides  = output.strides();
+    af::dim4 const istrides  = input.strides();
+    af::dim4 const pstrides  = position.strides();
+    af::dim4 const qstrides  = qosition.strides();
+    dim_t const oElems   = output.elements();
+    dim_t const iElems   = input.elements();
+
+    approx2_op<InT, LocT, Method> op;
+    bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); // true when position extends beyond dims 0 and 1
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+            for(dim_t y = 0; y < odims[1]; y++) {
+                for(dim_t x = 0; x < odims[0]; x++) {
+                    op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
+                       ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp
new file mode 100644
index 0000000000..86befaf74e
--- /dev/null
+++ b/src/backend/cpu/kernel/assign.hpp
@@ -0,0 +1,80 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <vector>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Copies `rhs` into `out`; per dimension the destination index comes from a sequence (isSeq) or an index array.
+template<typename T>
+void assign(Array<T> out, Array<T> const rhs, std::vector<bool> const isSeq,
+            std::vector<af_seq> const seqs, std::vector< Array<uint> > const idxArrs)
+{
+    af::dim4 dDims = out.getDataDims();
+    af::dim4 pDims = out.dims();
+    // offsets & strides (derived from seqs) of the destination that rhs is copied into
+    af::dim4 dst_offsets = toOffset(seqs, dDims);
+    af::dim4 dst_strides = toStride(seqs, dDims);
+    // retrieve rhs array dimensions & strides
+    af::dim4 src_dims    = rhs.dims();
+    af::dim4 src_strides = rhs.strides();
+    // raw pointers to the index data; entries 0..3 of idxArrs are all accessed here
+    uint const * const ptr0 = idxArrs[0].get();
+    uint const * const ptr1 = idxArrs[1].get();
+    uint const * const ptr2 = idxArrs[2].get();
+    uint const * const ptr3 = idxArrs[3].get();
+
+    const T * src= rhs.get();
+    T * dst      = out.get();
+
+    for(dim_t l=0; l<src_dims[3]; ++l) {
+        // At each level: isSeq[d] ? loop index + seq offset : value from the index array; then trimIndex().
+        dim_t src_loff = l*src_strides[3];
+
+        dim_t dst_lIdx = trimIndex(isSeq[3] ? l+dst_offsets[3] : ptr3[l], pDims[3]);
+        dim_t dst_loff = dst_lIdx * dst_strides[3];
+
+        for(dim_t k=0; k<src_dims[2]; ++k) {
+
+            dim_t src_koff = k*src_strides[2];
+
+            dim_t dst_kIdx = trimIndex(isSeq[2] ? k+dst_offsets[2] : ptr2[k], pDims[2]);
+            dim_t dst_koff = dst_kIdx * dst_strides[2];
+
+            for(dim_t j=0; j<src_dims[1]; ++j) {
+
+                dim_t src_joff = j*src_strides[1];
+
+                dim_t dst_jIdx = trimIndex(isSeq[1] ? j+dst_offsets[1] : ptr1[j], pDims[1]);
+                dim_t dst_joff = dst_jIdx * dst_strides[1];
+
+                for(dim_t i=0; i<src_dims[0]; ++i) {
+
+                    dim_t src_ioff = i*src_strides[0];
+                    dim_t src_idx  = src_ioff + src_joff + src_koff + src_loff;
+
+                    dim_t dst_iIdx = trimIndex(isSeq[0] ? i+dst_offsets[0] : ptr0[i], pDims[0]);
+                    dim_t dst_ioff = dst_iIdx * dst_strides[0];
+                    dim_t dst_idx  = dst_ioff + dst_joff + dst_koff + dst_loff;
+
+                    dst[dst_idx] = src[src_idx]; // element copy from rhs into the selected slot of out
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/bilateral.hpp b/src/backend/cpu/kernel/bilateral.hpp
new file mode 100644
index 0000000000..c950bbd084
--- /dev/null
+++ b/src/backend/cpu/kernel/bilateral.hpp
@@ -0,0 +1,81 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+#include <cmath>
+
+namespace cpu
+{
+namespace kernel
+{
+// Bilateral filter applied to each 2-D slice of `in`, written to `out`; IsColor is not referenced in this body.
+template<typename OutT, typename InT, bool IsColor>
+void bilateral(Array<OutT> out, Array<InT> const in, float const s_sigma, float const c_sigma)
+{
+    af::dim4 const dims     = in.dims();
+    af::dim4 const istrides = in.strides();
+    af::dim4 const ostrides = out.strides();
+
+          OutT *outData = out.get();
+    InT const * inData  = in.get();
+
+    // clamp spatial and chromatic sigmas
+    float space_       = std::min(11.5f, std::max(s_sigma, 0.f));
+    float color_       = std::max(c_sigma, 0.f);
+    dim_t const radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); // window is [-radius, radius], radius >= 1
+    float const svar   = space_*space_;
+    float const cvar   = color_*color_;
+
+    for(dim_t b3=0; b3<dims[3]; ++b3) {
+        // b3 for loop handles following batch configurations
+        //  - gfor
+        //  - input based batch
+        //      - when input is 4d array for color images
+        for(dim_t b2=0; b2<dims[2]; ++b2) {
+            // b2 for loop handles following batch configurations
+            //  - channels
+            //  - input based batch
+            //      - when input is 3d array for grayscale images
+            for(dim_t j=0; j<dims[1]; ++j) {
+                // j steps along 2nd dimension
+                for(dim_t i=0; i<dims[0]; ++i) {
+                    // i steps along 1st dimension
+                    OutT norm = 0.0; // sum of the weights
+                    OutT res  = 0.0; // weighted sum of the neighbourhood values
+                    OutT const center = (OutT)inData[getIdx(istrides, i, j)];
+                    for(dim_t wj=-radius; wj<=radius; ++wj) {
+                        // clamps offsets
+                        dim_t tj = clamp(j+wj, 0, dims[1]-1);
+                        for(dim_t wi=-radius; wi<=radius; ++wi) {
+                            // clamps offsets
+                            dim_t ti = clamp(i+wi, 0, dims[0]-1);
+                            // proceed
+                            OutT const val= (OutT)inData[getIdx(istrides, ti, tj)];
+                            OutT const gauss_space = (wi*wi+wj*wj)/(-2.0*svar);
+                            OutT const gauss_range = ((center-val)*(center-val))/(-2.0*cvar);
+                            OutT const weight = std::exp(gauss_space+gauss_range);
+                            norm += weight;
+                            res += val*weight;
+                        }
+                    } // filter loop ends here
+
+                    outData[getIdx(ostrides, i, j)] = res/norm;
+                } //1st dimension loop ends here
+            } //2nd dimension loop ends here
+            outData += ostrides[2]; // step both pointers by the dim-2 stride; strides[3] is never applied
+            inData  += istrides[2];
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/convolve.hpp b/src/backend/cpu/kernel/convolve.hpp
new file mode 100644
index 0000000000..79d684dd64
--- /dev/null
+++ b/src/backend/cpu/kernel/convolve.hpp
@@ -0,0 +1,267 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// 1D convolution of one signal with one filter (fDims[0] taps); samples outside the signal count as zero.
+template<typename InT, typename AccT, bool Expand>
+void one2one_1d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims,
+                af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & sStrides)
+{
+    dim_t const first = (Expand ? 0 : fDims[0]/2);                // first full-convolution index written
+    dim_t const last  = (Expand ? oDims[0] : first + sDims[0]);   // one past the last index written
+    for(dim_t o=first; o<last; ++o) {
+        AccT sum = 0.0;
+        for(dim_t tap=0; tap<fDims[0]; ++tap) {
+            dim_t const src = o-tap;   // signal sample paired with this filter tap
+            InT const sample = ((src<0 || src>=sDims[0]) ? InT(0) : iptr[src*sStrides[0]]);
+            sum += AccT(sample * fptr[tap]);
+        }
+        optr[o-first] = InT(sum);   // results are stored densely, beginning at optr[0]
+    }
+}
+// 2D convolution of one signal plane with one filter plane; samples outside the signal count as zero.
+template<typename InT, typename AccT, bool Expand>
+void one2one_2d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims,
+                af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides,
+                af::dim4 const & sStrides, af::dim4 const & fStrides)
+{
+    dim_t jStart = (Expand ? 0 : fDims[1]/2);
+    dim_t jEnd   = (Expand ? oDims[1] : jStart + sDims[1]);
+    dim_t iStart = (Expand ? 0 : fDims[0]/2);
+    dim_t iEnd   = (Expand ? oDims[0] : iStart + sDims[0]);
+    // Without Expand only the signal-sized window starting at fDims/2 of the full result is computed
+    for(dim_t j=jStart; j<jEnd; ++j) {
+        dim_t joff = (j-jStart)*oStrides[1];   // output offset of this row, rebased to start at 0
+
+        for(dim_t i=iStart; i<iEnd; ++i) {
+
+            AccT accum = AccT(0);
+            for(dim_t wj=0; wj<fDims[1]; ++wj) {
+                dim_t jIdx  = j-wj;   // signal coordinate along dim 1 paired with filter index wj
+                dim_t w_joff = wj*fStrides[1];
+                dim_t s_joff = jIdx * sStrides[1];
+                bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
+
+                for(dim_t wi=0; wi<fDims[0]; ++wi) {
+                    dim_t iIdx = i-wi;   // signal coordinate along dim 0 paired with filter index wi
+
+                    InT s_val = InT(0);   // stays zero when the sample lies outside the signal
+                    if ( isJValid && (iIdx>=0 && iIdx<sDims[0])) {
+                        s_val = iptr[s_joff+iIdx*sStrides[0]];
+                    }
+
+                    accum += AccT(s_val * fptr[w_joff+wi*fStrides[0]]);
+                }
+            }
+            optr[joff+i-iStart] = InT(accum);   // oStrides[0] is not applied: dim 0 of the output is written densely
+        }
+    }
+}
+// 3D convolution of one signal volume with one filter volume; samples outside the signal count as zero.
+template<typename InT, typename AccT, bool Expand>
+void one2one_3d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims,
+                af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides,
+                af::dim4 const & sStrides, af::dim4 const & fStrides)
+{
+    dim_t kStart = (Expand ? 0 : fDims[2]/2);
+    dim_t kEnd   = (Expand ? oDims[2] : kStart + sDims[2]);
+    dim_t jStart = (Expand ? 0 : fDims[1]/2);
+    dim_t jEnd   = (Expand ? oDims[1] : jStart + sDims[1]);
+    dim_t iStart = (Expand ? 0 : fDims[0]/2);
+    dim_t iEnd   = (Expand ? oDims[0] : iStart + sDims[0]);
+    // Without Expand only the signal-sized window starting at fDims/2 of the full result is computed
+    for(dim_t k=kStart; k<kEnd; ++k) {
+        dim_t koff = (k-kStart)*oStrides[2];   // output offset of this plane, rebased to start at 0
+
+        for(dim_t j=jStart; j<jEnd; ++j) {
+            dim_t joff = (j-jStart)*oStrides[1];   // output offset of this row, rebased to start at 0
+
+            for(dim_t i=iStart; i<iEnd; ++i) {
+
+                AccT accum = AccT(0);
+                for(dim_t wk=0; wk<fDims[2]; ++wk) {
+                    dim_t kIdx  = k-wk;   // signal coordinate along dim 2 paired with filter index wk
+                    dim_t w_koff = wk*fStrides[2];
+                    dim_t s_koff = kIdx * sStrides[2];
+                    bool isKValid = (kIdx>=0 && kIdx<sDims[2]);
+
+                    for(dim_t wj=0; wj<fDims[1]; ++wj) {
+                        dim_t jIdx  = j-wj;   // signal coordinate along dim 1 paired with filter index wj
+                        dim_t w_joff = wj*fStrides[1];
+                        dim_t s_joff = jIdx * sStrides[1];
+                        bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
+
+                        for(dim_t wi=0; wi<fDims[0]; ++wi) {
+                            dim_t iIdx = i-wi;   // signal coordinate along dim 0 paired with filter index wi
+
+                            InT s_val = InT(0);   // stays zero when the sample lies outside the signal
+                            if ( isKValid && isJValid && (iIdx>=0 && iIdx<sDims[0])) {
+                                s_val = iptr[s_koff+s_joff+iIdx*sStrides[0]];
+                            }
+
+                            accum += AccT(s_val * fptr[w_koff+w_joff+wi*fStrides[0]]);
+                        }
+                    }
+                }
+                optr[koff+joff+i-iStart] = InT(accum);   // oStrides[0] is not applied: dim 0 is written densely
+            } //i loop ends here
+        } // j loop ends here
+    } // k loop ends here
+}
+// Batched convolution driver: runs the baseDim-dimensional one2one kernel once per batch entry selected by kind.
+template<typename InT, typename AccT, dim_t baseDim, bool Expand>
+void convolve_nd(Array<InT> out, Array<InT> const signal, Array<AccT> const filter, ConvolveBatchKind kind)
+{
+    InT * optr = out.get();
+    InT const * const iptr = signal.get();
+    AccT const * const fptr = filter.get();
+
+    af::dim4 const oDims = out.dims();
+    af::dim4 const sDims = signal.dims();
+    af::dim4 const fDims = filter.dims();
+
+    af::dim4 const oStrides = out.strides();
+    af::dim4 const sStrides = signal.strides();
+    af::dim4 const fStrides = filter.strides();
+
+    dim_t out_step[4]  = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
+    dim_t in_step[4]   = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
+    dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
+    dim_t batch[4]     = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */
+    // A step left at 0 makes every batch entry reuse the same data of that operand
+    for (dim_t i=1; i<4; ++i) {
+        switch(kind) {
+            case CONVOLVE_BATCH_SIGNAL:   // many signals, one filter: the filter step stays 0
+                out_step[i] = oStrides[i];
+                in_step[i]  = sStrides[i];
+                if (i>=baseDim) batch[i] = sDims[i];
+                break;
+            case CONVOLVE_BATCH_SAME:     // signal and filter advance together
+                out_step[i]  = oStrides[i];
+                in_step[i]   = sStrides[i];
+                filt_step[i] = fStrides[i];
+                if (i>=baseDim) batch[i] = sDims[i];
+                break;
+            case CONVOLVE_BATCH_KERNEL:   // one signal, many filters: the signal step stays 0
+                out_step[i]  = oStrides[i];
+                filt_step[i] = fStrides[i];
+                if (i>=baseDim) batch[i] = fDims[i];
+                break;
+            default:   // any other kind: steps stay 0 and batch stays 1, so a single convolution runs
+                break;
+        }
+    }
+    // Only dimensions >= baseDim received a batch count above; lower ones iterate exactly once
+    for (dim_t b3=0; b3<batch[3]; ++b3) {
+        for (dim_t b2=0; b2<batch[2]; ++b2) {
+            for (dim_t b1=0; b1<batch[1]; ++b1) {
+
+                InT * out          = optr + b1 * out_step[1] + b2 * out_step[2] + b3 * out_step[3];   // NOTE: shadows the `out` parameter
+                InT const *in      = iptr + b1 *  in_step[1] + b2 *  in_step[2] + b3 *  in_step[3];
+                AccT const *filt = fptr + b1 *filt_step[1] + b2 *filt_step[2] + b3 *filt_step[3];
+
+                switch(baseDim) {   // no default: a baseDim other than 1, 2 or 3 performs no work
+                    case 1: one2one_1d<InT, AccT, Expand>(out, in, filt, oDims, sDims, fDims, sStrides);                     break;
+                    case 2: one2one_2d<InT, AccT, Expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
+                    case 3: one2one_3d<InT, AccT, Expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
+                }
+            }
+        }
+    }
+}
+// One pass of a separable 2D convolution along dimension conv_dim (0 or 1); orgDims and fStride are unused here.
+template<typename InT, typename AccT, dim_t conv_dim, bool Expand>
+void convolve2_separable(InT *optr, InT const * const iptr, AccT const * const fptr,
+                        af::dim4 const & oDims, af::dim4 const & sDims, af::dim4 const & orgDims, dim_t fDim,
+                        af::dim4 const & oStrides, af::dim4 const & sStrides, dim_t fStride)
+{
+    for(dim_t j=0; j<oDims[1]; ++j) {
+
+        dim_t jOff = j*oStrides[1];
+        dim_t cj = j + (conv_dim==1)*(Expand ? 0: fDim>>1);   // shifted by half the filter when not expanding
+
+        for(dim_t i=0; i<oDims[0]; ++i) {
+
+            dim_t iOff = i*oStrides[0];
+            dim_t ci = i + (conv_dim==0)*(Expand ? 0 : fDim>>1);   // shifted by half the filter when not expanding
+
+            AccT accum = scalar<AccT>(0);
+
+            for(dim_t f=0; f<fDim; ++f) {
+                AccT f_val = fptr[f];   // keep the tap in AccT: narrowing it to InT truncates fractional taps
+                InT s_val;
+                // The signal is addressed through its strides so that non-contiguous inputs are read correctly
+                if (conv_dim==0) {
+                    dim_t offi = ci - f;
+                    bool isCIValid = offi>=0 && offi<sDims[0];
+                    bool isCJValid = cj>=0 && cj<sDims[1];
+                    s_val = (isCJValid && isCIValid ? iptr[cj*sStrides[1]+offi*sStrides[0]] : scalar<InT>(0));
+                } else {
+                    dim_t offj = cj - f;
+                    bool isCIValid = ci>=0 && ci<sDims[0];
+                    bool isCJValid = offj>=0 && offj<sDims[1];
+                    s_val = (isCJValid && isCIValid ? iptr[offj*sStrides[1]+ci*sStrides[0]] : scalar<InT>(0));
+                }
+
+                accum += AccT(s_val * f_val);   // out-of-range samples were replaced by zero above
+            }
+            optr[iOff+jOff] = InT(accum);
+        }
+    }
+}
+// Separable 2D convolution: c_filter is applied along dim 0 into a temporary, then r_filter along dim 1 into out.
+template<typename InT, typename AccT, bool Expand>
+void convolve2(Array<InT> out, Array<InT> const signal,
+               Array<AccT> const c_filter, Array<AccT> const r_filter,
+               af::dim4 const tDims)
+{
+    Array<InT> temp = createEmptyArray<InT>(tDims);   // result of the first pass, stored as InT
+
+    dim_t cflen = (dim_t)c_filter.elements();
+    dim_t rflen = (dim_t)r_filter.elements();
+
+    auto oDims = out.dims();
+    auto sDims = signal.dims();
+
+    auto oStrides = out.strides();
+    auto sStrides = signal.strides();
+    auto tStrides = temp.strides();
+    // Every (b2, b3) slice of the output is convolved independently
+    for (dim_t b3=0; b3<oDims[3]; ++b3) {
+
+        dim_t i_b3Off = b3*sStrides[3];
+        dim_t t_b3Off = b3*tStrides[3];
+        dim_t o_b3Off = b3*oStrides[3];
+
+        for (dim_t b2=0; b2<oDims[2]; ++b2) {
+
+            InT const * const iptr = signal.get()+ b2*sStrides[2] + i_b3Off;
+            InT *tptr = temp.get() + b2*tStrides[2] + t_b3Off;
+            InT *optr = out.get()  + b2*oStrides[2] + o_b3Off;
+            // Pass 1: signal -> temp, convolving along dim 0
+            convolve2_separable<InT, AccT, 0, Expand>(tptr, iptr, c_filter.get(),
+                    tDims, sDims, sDims, cflen,
+                    tStrides, sStrides, c_filter.strides()[0]);
+            // Pass 2: temp -> out, convolving along dim 1
+            convolve2_separable<InT, AccT, 1, Expand>(optr, tptr, r_filter.get(),
+                    oDims, tDims, sDims, rflen,
+                    oStrides, tStrides, r_filter.strides()[0]);
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/copy.hpp b/src/backend/cpu/kernel/copy.hpp
new file mode 100644
index 0000000000..70d6705ec2
--- /dev/null
+++ b/src/backend/cpu/kernel/copy.hpp
@@ -0,0 +1,90 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Recursively copies a dims-shaped block from strided src into dst, peeling dimensions from `dim` down to 0.
+template<typename T>
+void stridedCopy(T* dst, af::dim4 const & ostrides, T const * src,
+                 af::dim4 const & dims, af::dim4 const & strides, unsigned dim)
+{
+    if(dim != 0) {
+        // One recursive copy per slice; both cursors then advance by this dimension's stride
+        for(dim_t n = 0; n < dims[dim]; ++n) {
+            stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
+            src += strides[dim];
+            dst += ostrides[dim];
+        }
+        return;
+    }
+    if(strides[0] != 1) {
+        // Innermost run of the source is not contiguous: gather it element by element
+        for(dim_t n = 0; n < dims[0]; ++n) dst[n] = src[strides[0]*n];
+        return;
+    }
+    //FIXME: Check for errors / exceptions
+    memcpy(dst, src, dims[0] * sizeof(T));
+}
+// Copies src into dst as OutT(value)*OutT(factor); dst elements with no matching src element get default_value.
+template<typename OutT, typename InT>
+void copy(Array<OutT> dst, Array<InT> const src, OutT default_value, double factor)
+{
+    af::dim4 src_dims       = src.dims();
+    af::dim4 dst_dims       = dst.dims();
+    af::dim4 src_strides    = src.strides();
+    af::dim4 dst_strides    = dst.strides();
+
+    InT const * const src_ptr = src.get();
+    OutT * dst_ptr      = dst.get();
+    // trgt_*: extent of the region present in both arrays along each dimension
+    dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
+    dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
+    dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
+    dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
+    // The loops cover all of dst; src is only read inside the shared region
+    for(dim_t l=0; l<dst_dims[3]; ++l) {
+
+        dim_t src_loff = l*src_strides[3];
+        dim_t dst_loff = l*dst_strides[3];
+        bool isLvalid = l<trgt_l;
+
+        for(dim_t k=0; k<dst_dims[2]; ++k) {
+
+            dim_t src_koff = k*src_strides[2];
+            dim_t dst_koff = k*dst_strides[2];
+            bool isKvalid = k<trgt_k;
+
+            for(dim_t j=0; j<dst_dims[1]; ++j) {
+
+                dim_t src_joff = j*src_strides[1];
+                dim_t dst_joff = j*dst_strides[1];
+                bool isJvalid = j<trgt_j;
+
+                for(dim_t i=0; i<dst_dims[0]; ++i) {
+                    OutT temp = default_value;   // used as-is when (i, j, k, l) lies outside src
+                    if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
+                        dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
+                        temp = OutT(src_ptr[src_idx])*OutT(factor);   // convert first, then scale in OutT
+                    }
+                    dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
+                    dst_ptr[dst_idx] = temp;
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/diagonal.hpp b/src/backend/cpu/kernel/diagonal.hpp
new file mode 100644
index 0000000000..0c81fc90f2
--- /dev/null
+++ b/src/backend/cpu/kernel/diagonal.hpp
@@ -0,0 +1,67 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Builds one size x size matrix per column of `in`, placing that column on diagonal `num`; other elements are zero.
+template<typename T>
+void diagCreate(Array<T> out, Array<T> const in, int const num)
+{
+    int batch = in.dims()[1];    // number of input columns, i.e. matrices to build
+    int size  = out.dims()[0];   // edge length of each output matrix
+
+    T const * iptr = in.get();
+    T * optr = out.get();
+
+    for (int k = 0; k < batch; k++) {
+        for (int j = 0; j < size; j++) {
+            for (int i = 0; i < size; i++) {
+                T val = scalar<T>(0);
+                if (i == j - num) {   // element (i, j) lies on the requested diagonal
+                    val = (num > 0) ? iptr[i] : iptr[j];   // indexed by row when num > 0, otherwise by column
+                }
+                optr[i + j * out.strides()[1]] = val;
+            }
+        }
+        optr += out.strides()[2];   // next output matrix
+        iptr += in.strides()[1];    // next input column
+    }
+}
+// Extracts diagonal `num` of every 2D slice of `in` into dim 0 of the matching slice of `out`.
+template<typename T>
+void diagExtract(Array<T> out, Array<T> const in, int const num)
+{
+    dim4 const odims = out.dims();
+    dim4 const idims = in.dims();
+    // Offset of the diagonal's first element: num columns across for num > 0, otherwise -num elements down dim 0
+    int const i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
+
+    for (int l = 0; l < (int)odims[3]; l++) {
+
+        for (int k = 0; k < (int)odims[2]; k++) {
+            const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
+            T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
+
+            for (int i = 0; i < (int)odims[0]; i++) {
+                T val = scalar<T>(0);   // kept when i falls outside the input's first two dimensions
+                if (i < idims[0] && i < idims[1]) val =  iptr[i * in.strides()[1] + i];   // one column and one row per step
+                optr[i] = val;
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/diff.hpp b/src/backend/cpu/kernel/diff.hpp
new file mode 100644
index 0000000000..1a3d7ba110
--- /dev/null
+++ b/src/backend/cpu/kernel/diff.hpp
@@ -0,0 +1,86 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// First-order forward difference along dimension dim, evaluated for every element of out.
+template<typename T>
+void diff1(Array<T> out, Array<T> const in, int const dim)
+{
+    af::dim4 dims = out.dims();
+    // Bool for dimension: the flag matching dim becomes a +1 coordinate offset below
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+    af::dim4 const iStrides = in.strides();    // loop-invariant: fetched once instead of per element
+    T const * const inPtr = in.get();
+    T * outPtr = out.get();
+    af::dim4 const oStrides = out.strides();   // loop-invariant: fetched once instead of per element
+    // TODO: Improve this
+    for(dim_t l = 0; l < dims[3]; l++) {
+        for(dim_t k = 0; k < dims[2]; k++) {
+            for(dim_t j = 0; j < dims[1]; j++) {
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+                    // Offsets are kept in dim_t: storing them in int narrows linear indices of large arrays
+                    dim_t idx = getIdx(iStrides, i, j, k, l);
+                    dim_t jdx = getIdx(iStrides, i + is_dim0, j + is_dim1,
+                                       k + is_dim2, l + is_dim3);
+                    dim_t odx = getIdx(oStrides, i, j, k, l);
+                    outPtr[odx] = inPtr[jdx] - inPtr[idx];
+                }
+            }
+        }
+    }
+}
+// Second-order forward difference along dimension dim, evaluated for every element of out.
+template<typename T>
+void diff2(Array<T> out, Array<T> const in, int const dim)
+{
+    af::dim4 dims = out.dims();
+    // Bool for dimension: the flag matching dim becomes a +1 / +2 coordinate offset below
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+    af::dim4 const iStrides = in.strides();    // loop-invariant: fetched once instead of per element
+    T const * const inPtr = in.get();
+    T * outPtr = out.get();
+    af::dim4 const oStrides = out.strides();   // loop-invariant: fetched once instead of per element
+    // TODO: Improve this
+    for(dim_t l = 0; l < dims[3]; l++) {
+        for(dim_t k = 0; k < dims[2]; k++) {
+            for(dim_t j = 0; j < dims[1]; j++) {
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    // Operation: out[index] = in[index + 2 * dim_size] - 2 * in[index + 1 * dim_size] + in[index]
+                    // Offsets are kept in dim_t: storing them in int narrows linear indices of large arrays
+                    // jdx / kdx address the elements one and two steps further along dim
+                    dim_t idx = getIdx(iStrides, i, j, k, l);
+                    dim_t jdx = getIdx(iStrides, i + is_dim0, j + is_dim1,
+                                       k + is_dim2, l + is_dim3);
+                    dim_t kdx = getIdx(iStrides, i + 2 * is_dim0, j + 2 * is_dim1,
+                                       k + 2 * is_dim2, l + 2 * is_dim3);
+                    dim_t odx = getIdx(oStrides, i, j, k, l);
+                    outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/dot.hpp b/src/backend/cpu/kernel/dot.hpp
new file mode 100644
index 0000000000..71f2c6f959
--- /dev/null
+++ b/src/backend/cpu/kernel/dot.hpp
@@ -0,0 +1,46 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <complex>
+
+namespace cpu
+{
+namespace kernel
+{
+// Conjugate helper: the generic version returns its argument unchanged.
+template<typename T> T
+conj(T  x) { return x; }
+// Complex specializations; `inline` because full specializations defined in a header are otherwise emitted per TU.
+template<> inline cfloat  conj<cfloat> (cfloat  c) { return std::conj(c); }
+template<> inline cdouble conj<cdouble>(cdouble c) { return std::conj(c); }
+// Dot product: sums lhs[i]*rhs[i] over lhs.dims()[0] elements and stores it in output's first element.
+template<typename T, bool conjugate, bool both_conjugate>
+void dot(Array<T> output, const Array<T> lhs, const Array<T> rhs,
+         af_mat_prop optLhs, af_mat_prop optRhs)
+{
+    // optLhs / optRhs are not consulted here; conjugation is selected by the template flags
+    T const * const left  = lhs.get();
+    T const * const right = rhs.get();
+    int const count = lhs.dims()[0];
+
+    T acc = 0;
+    for(int n = 0; n < count; ++n) {
+        // conjugate: the left operand enters the product conjugated
+        T const l = conjugate ? kernel::conj(left[n]) : left[n];
+        acc += l * right[n];
+    }
+    // both_conjugate: the accumulated sum is conjugated as well
+    *output.get() = both_conjugate ? kernel::conj(acc) : acc;
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp
new file mode 100644
index 0000000000..02da3e4d33
--- /dev/null
+++ b/src/backend/cpu/kernel/fast.hpp
@@ -0,0 +1,224 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// y offset of circle position i: clamp(i-4) for i < 8, clamp(12-i) from i = 8 onwards, both limited to [-3, 3].
+inline int idx_y(int i)
+{
+    // -(i-8-4) equals 12-i, so the second half of the ring walks the offsets back
+    int const raw = (i < 8) ? (i-4) : (12-i);
+
+    return clamp(raw, -3, 3);
+}
+// x offset of circle position i: the idx_y sequence read 4 positions ahead, wrapping around at i = 12.
+inline int idx_x(int i)
+{
+    // i+4 for the first 12 positions, i-12 afterwards
+    int const shifted = (i >= 12) ? (i-12) : (i+4);
+
+    return idx_y(shifted);
+}
+// Linear offset of element (y, x) when consecutive x values are idim0 elements apart.
+inline int idx(int y, int x, unsigned idim0)
+{
+    return y + x * idim0;
+}
+
+// test_greater()
+// Returns 1 when pixel x >= p + thr (the comparison is inclusive), 0 otherwise
+inline int test_greater(float x, float p, float thr)
+{
+    return (p + thr <= x) ? 1 : 0;
+}
+
+// test_smaller()
+// Returns 1 when pixel x <= p - thr (the comparison is inclusive), 0 otherwise
+inline int test_smaller(float x, float p, float thr)
+{
+    return (p - thr >= x) ? 1 : 0;
+}
+
+// test_pixel()
+// Classifies the pixel at (y, x) against the centre value p:
+// -1 when it is <= p - thr, 1 when it is >= p + thr (and not also <= p - thr), 0 otherwise
+// The result is the bitwise OR of the negated "smaller" test and the "greater" test
+template<typename T>
+inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0)
+{
+    float const v = (float)image[idx(y,x,idim0)];   // the pixel is read once and used by both tests
+    return -test_smaller(v, p, thr) | test_greater(v, p, thr);
+}
+
+// abs_diff()
+// Returns absolute difference of x and y; one overload per operand type (int, unsigned, float, double)
+inline int abs_diff(int x, int y)
+{
+    return abs(x - y);
+}
+inline unsigned abs_diff(unsigned x, unsigned y)
+{
+    return (unsigned)abs((int)x - (int)y);   // both operands are cast to int before the subtraction
+}
+inline float abs_diff(float x, float y)
+{
+    return fabs(x - y);
+}
+inline double abs_diff(double x, double y)
+{
+    return fabs(x - y);
+}
+// Scans the image interior for pixels whose 16-position circle holds arc_length consecutive brighter or darker pixels.
+template<typename T>
+void locate_features(Array<T> const & in, Array<float> & score,
+                     Array<float> & x_out, Array<float> & y_out,
+                     Array<float> & score_out, unsigned* count, float const thr,
+                     unsigned const arc_length, unsigned const nonmax,
+                     unsigned const max_feat, unsigned const edge)
+{
+    af::dim4 in_dims = in.dims();
+    T const * in_ptr = in.get();
+    // y runs along dim 0 and x along dim 1; `edge` pixels are skipped on every side
+    for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
+        for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
+            float p = in_ptr[idx(y, x, in_dims[0])];
+
+            // Start by testing opposite pixels of the circle that will result in
+            // a non-keypoint
+            int d;
+            d  = test_pixel<T>(in_ptr, p, thr, y-3,   x, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3,   x, in_dims[0]);
+            if (d == 0)
+                continue;
+            // d stays non-zero only while every tested opposite pair has a pixel outside the threshold band
+            d &= test_pixel<T>(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+2, x-2, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y  , x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y  , x-3, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-2, x-2, in_dims[0]);
+            if (d == 0)
+                continue;
+
+            d &= test_pixel<T>(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3, x-1, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+1, x-3, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-1, x-3, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-3, x-1, in_dims[0]);
+            if (d == 0)
+                continue;
+
+            int sum = 0;
+
+            // Sum responses [-1, 0 or 1] of first arc_length pixels
+            for (int i = 0; i < static_cast<int>(arc_length); i++)
+                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
+
+            // Test maximum and minimum responses of first segment of arc_length
+            // pixels
+            int max_sum = 0, min_sum = 0;
+            max_sum = std::max(max_sum, sum);
+            min_sum = std::min(min_sum, sum);
+
+            // Sum responses and test the remaining 16-arc_length pixels of the circle
+            for (int i = arc_length; i < 16; i++) {
+                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]);   // pixel leaving the window
+                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);                         // pixel entering the window
+                max_sum = std::max(max_sum, sum);
+                min_sum = std::min(min_sum, sum);
+            }
+
+            // To completely test all possible segments, it's necessary to test
+            // segments that include the top junction of the circle
+            for (int i = 0; i < static_cast<int>(arc_length-1); i++) {
+                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]);
+                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
+                max_sum = std::max(max_sum, sum);
+                min_sum = std::min(min_sum, sum);
+            }
+            // Score: summed margin beyond thr over all circle pixels passing the bright test, likewise the dark test
+            float s_bright = 0, s_dark = 0;
+            for (int i = 0; i < 16; i++) {
+                float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])];
+
+                s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
+                s_dark   += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr);
+            }
+
+            // If sum at some point was equal to (+-)arc_length, there is a segment
+            // for which all pixels are much brighter or much darker than
+            // central pixel p.
+            if (max_sum == static_cast<int>(arc_length) || min_sum == -static_cast<int>(arc_length)) {
+                unsigned j = *count;
+                ++*count;   // counts every detection, including those beyond max_feat that are not stored
+                if (j < max_feat) {
+                    float *x_out_ptr = x_out.get();
+                    float *y_out_ptr = y_out.get();
+                    float *score_out_ptr = score_out.get();
+                    x_out_ptr[j]     = static_cast<float>(x);
+                    y_out_ptr[j]     = static_cast<float>(y);
+                    score_out_ptr[j] = static_cast<float>(std::max(s_bright, s_dark));
+                    if (nonmax == 1) {   // the dense score image is only filled when nonmax == 1
+                        float* score_ptr = score.get();
+                        score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark);
+                    }
+                }
+            }
+        }
+    }
+}
+// Non-maximal suppression: keeps a candidate only when its score exceeds all 8 neighbouring scores.
+inline void non_maximal(Array<float> const & score, const Array<float> & x_in, const Array<float> & y_in,
+                 Array<float> & x_out, Array<float> & y_out, Array<float> & score_out,
+                 unsigned* count, unsigned const total_feat, unsigned const edge)
+{
+    float const * score_ptr = score.get();
+    float const * x_in_ptr = x_in.get();
+    float const * y_in_ptr = y_in.get();
+
+    af::dim4 score_dims = score.dims();
+
+    for (unsigned k = 0; k < total_feat; k++) {
+        unsigned x = static_cast<unsigned>(round(x_in_ptr[k]));
+        unsigned y = static_cast<unsigned>(round(y_in_ptr[k]));
+        // Reject border candidates before reading their neighbours; y indexes dim 0 and x indexes dim 1
+        if (y >= score_dims[0] - edge - 1 || y <= edge + 1 ||
+            x >= score_dims[1] - edge - 1 || x <= edge + 1)
+            continue;
+
+        float v = score_ptr[y + score_dims[0] * x];
+        float max_v;
+        max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]);
+        max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]);
+        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x-1)]);
+        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x+1)]);
+        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]);
+        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x)  ]);
+        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]);
+
+        // Stores keypoint to feat_out if its response is maximum compared to
+        // its 8-neighborhood
+        if (v > max_v) {
+            unsigned j = *count;
+            ++*count;   // *count doubles as the write position in the output arrays
+
+            float *x_out_ptr = x_out.get();
+            float *y_out_ptr = y_out.get();
+            float *score_out_ptr = score_out.get();
+
+            x_out_ptr[j]     = static_cast<float>(x);
+            y_out_ptr[j]     = static_cast<float>(y);
+            score_out_ptr[j] = static_cast<float>(v);
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/fft.hpp b/src/backend/cpu/kernel/fft.hpp
new file mode 100644
index 0000000000..906c8ef5f5
--- /dev/null
+++ b/src/backend/cpu/kernel/fft.hpp
@@ -0,0 +1,192 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <Array.hpp>
+#include <fftw3.h>
+
+namespace cpu
+{
+namespace kernel
+{
+// Fills rdims with the first `rank` extents of idims in reversed order (rdims[0] = idims[rank-1]).
+template<int rank>
+void computeDims(int rdims[rank], const af::dim4 &idims)
+{
+    for (int dst = 0, src = rank - 1; dst < rank; ++dst, --src) {
+        rdims[dst] = idims[src];
+    }
+}
+// Per-element-type wrapper around FFTW's complex-to-complex plan API; specialized through TRANSFORM below.
+template<typename T>
+struct fftw_transform;
+// TRANSFORM(PRE, TY): fftw_transform<TY> with plan_t/ctype_t typedefs; create/execute/destroy forward to PRE##_plan_many_dft, PRE##_execute, PRE##_destroy_plan.
+#define TRANSFORM(PRE, TY)                                              \
+    template<>                                                          \
+    struct fftw_transform<TY>                                           \
+    {                                                                   \
+        typedef PRE##_plan plan_t;                                      \
+        typedef PRE##_complex ctype_t;                                  \
+                                                                        \
+        template<typename... Args>                                      \
+            plan_t create(Args... args)                                 \
+        { return PRE##_plan_many_dft(args...); }                        \
+        void execute(plan_t plan) { return PRE##_execute(plan); }       \
+        void destroy(plan_t plan) { return PRE##_destroy_plan(plan); }  \
+    };                                                                  \
+
+// Specializations: cfloat goes through the fftwf_ functions, cdouble through the fftw_ functions.
+TRANSFORM(fftwf, cfloat)
+TRANSFORM(fftw, cdouble)
+// Wrapper around FFTW's real-data plan API, keyed on <output type, input type>; specialized through TRANSFORM_REAL below.
+template<typename To, typename Ti>
+struct fftw_real_transform;
+// TRANSFORM_REAL(PRE, To, Ti, POST): fftw_real_transform<To, Ti> whose create() forwards to PRE##_plan_many_dft_##POST.
+#define TRANSFORM_REAL(PRE, To, Ti, POST)                               \
+    template<>                                                          \
+    struct fftw_real_transform<To, Ti>                                  \
+    {                                                                   \
+        typedef PRE##_plan plan_t;                                      \
+        typedef PRE##_complex ctype_t;                                  \
+                                                                        \
+        template<typename... Args>                                      \
+            plan_t create(Args... args)                                 \
+        { return PRE##_plan_many_dft_##POST(args...); }                 \
+        void execute(plan_t plan) { return PRE##_execute(plan); }       \
+        void destroy(plan_t plan) { return PRE##_destroy_plan(plan); }  \
+    };                                                                  \
+
+// r2c specializations produce a complex output from a real input; c2r specializations do the reverse.
+TRANSFORM_REAL(fftwf, cfloat , float , r2c)
+TRANSFORM_REAL(fftw , cdouble, double, r2c)
+TRANSFORM_REAL(fftwf, float , cfloat , c2r)
+TRANSFORM_REAL(fftw , double, cdouble, c2r)
+
+// In-place complex-to-complex FFT over the first `rank` dims of `in`; the remaining dims are batched.
+template<typename T, int rank, bool direction>
+void fft_inplace(Array<T> in)
+{
+    // `direction` selects FFTW_FORWARD (true) or FFTW_BACKWARD (false).
+    // The plan is created with FFTW_ESTIMATE, executed once and destroyed.
+    typedef fftw_transform<T> fftw_t;
+    typedef typename fftw_t::ctype_t ctype_t;
+
+    const af::dim4 idims    = in.dims();
+    const af::dim4 istrides = in.strides();
+
+    // computeDims reverses the extents (FFTW takes them in row-major order).
+    int fft_dims[rank];
+    int embed[rank];
+    computeDims<rank>(fft_dims, idims);
+    computeDims<rank>(embed, in.getDataDims());
+
+    // Every dimension beyond `rank` contributes to the number of transforms.
+    int howmany = 1;
+    for (int d = rank; d < 4; ++d) {
+        howmany *= idims[d];
+    }
+
+    const int elem_stride  = (int)istrides[0];
+    const int batch_stride = (int)istrides[rank];
+    const int sign         = direction ? FFTW_FORWARD : FFTW_BACKWARD;
+
+    // Input and output describe the same buffer and layout: the transform is in place.
+    fftw_t fftw;
+    typename fftw_t::plan_t fft_plan =
+        fftw.create(rank, fft_dims, howmany,
+                    (ctype_t *)in.get(), embed, elem_stride, batch_stride,
+                    (ctype_t *)in.get(), embed, elem_stride, batch_stride,
+                    sign, FFTW_ESTIMATE);
+
+    fftw.execute(fft_plan);
+    fftw.destroy(fft_plan);
+}
+// Batched real-to-complex FFT of `in` into `out` over the first `rank` dims of `in`.
+template<typename Tc, typename Tr, int rank>
+void fft_r2c(Array<Tc> out, const Array<Tr> in)
+{
+    // Layouts are taken from each array's data dims and strides, so `in`
+    // and `out` may use different embeddings.
+    typedef fftw_real_transform<Tc, Tr> fftw_t;
+    typedef typename fftw_t::ctype_t ctype_t;
+
+    const af::dim4 idims    = in.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+
+    // The logical transform size and the batch count come from the input's dims.
+    int fft_dims[rank];
+    int src_embed[rank];
+    int dst_embed[rank];
+    computeDims<rank>(fft_dims, idims);
+    computeDims<rank>(src_embed, in.getDataDims());
+    computeDims<rank>(dst_embed, out.getDataDims());
+
+    // Every dimension beyond `rank` contributes to the number of transforms.
+    int howmany = 1;
+    for (int d = rank; d < 4; ++d) {
+        howmany *= idims[d];
+    }
+
+    // The plan is created with FFTW_ESTIMATE, executed once and destroyed.
+    // The input pointer is cast to a plain Tr* for the plan call.
+    fftw_t fftw;
+    typename fftw_t::plan_t fft_plan =
+        fftw.create(rank, fft_dims, howmany,
+                    (Tr *)in.get(),
+                    src_embed, (int)istrides[0], (int)istrides[rank],
+                    (ctype_t *)out.get(),
+                    dst_embed, (int)ostrides[0], (int)ostrides[rank],
+                    FFTW_ESTIMATE);
+
+    fftw.execute(fft_plan);
+    fftw.destroy(fft_plan);
+}
+// Batched complex-to-real FFT of `in` into `out`; `odims` supplies the transform size and batch count.
+template<typename Tr, typename Tc, int rank>
+void fft_c2r(Array<Tr> out, const Array<Tc> in, const af::dim4 odims)
+{
+    // Layouts are taken from each array's data dims and strides, so `in`
+    // and `out` may use different embeddings.
+    typedef fftw_real_transform<Tr, Tc> fftw_t;
+    typedef typename fftw_t::ctype_t ctype_t;
+
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+
+    // The transform size is read from odims rather than from in.dims().
+    int fft_dims[rank];
+    int src_embed[rank];
+    int dst_embed[rank];
+    computeDims<rank>(fft_dims, odims);
+    computeDims<rank>(src_embed, in.getDataDims());
+    computeDims<rank>(dst_embed, out.getDataDims());
+
+    // Every dimension beyond `rank` contributes to the number of transforms.
+    int howmany = 1;
+    for (int d = rank; d < 4; ++d) {
+        howmany *= odims[d];
+    }
+
+    // The plan is created with FFTW_ESTIMATE, executed once and destroyed.
+    fftw_t fftw;
+    typename fftw_t::plan_t fft_plan =
+        fftw.create(rank, fft_dims, howmany,
+                    (ctype_t *)in.get(),
+                    src_embed, (int)istrides[0], (int)istrides[rank],
+                    (Tr *)out.get(),
+                    dst_embed, (int)ostrides[0], (int)ostrides[rank],
+                    FFTW_ESTIMATE);
+
+    fftw.execute(fft_plan);
+    fftw.destroy(fft_plan);
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp
new file mode 100644
index 0000000000..ad586f7d28
--- /dev/null
+++ b/src/backend/cpu/kernel/fftconvolve.hpp
@@ -0,0 +1,256 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <convolve_common.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Packs `in` into pairs of `out` elements: pair d0 holds in[d0] and in[d0 + id0_half]; every other pair is zero-filled.
+template<typename To, typename Ti>
+void packData(Array<To> out, const af::dim4 od, const af::dim4 os, Array<Ti> const in)
+{
+    To* out_ptr = out.get();
+    // od/os (extents and strides used to address out_ptr) come from the caller, not from `out` itself.
+    const af::dim4 id = in.dims();
+    const af::dim4 is = in.strides();
+    const Ti* in_ptr = in.get();
+    // divup is defined elsewhere; presumably ceil(id[0] / 2) -- TODO confirm.
+    int id0_half = divup(id[0], 2);
+    bool odd_id0 = (id[0] % 2 == 1);
+    // od[0] is halved because every d0 iteration writes two consecutive To elements.
+    for (int d3 = 0; d3 < (int)od[3]; d3++) {
+        for (int d2 = 0; d2 < (int)od[2]; d2++) {
+            for (int d1 = 0; d1 < (int)od[1]; d1++) {
+                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+                    const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+                    // Inside the input extent (dim 0 limited to id0_half): copy; otherwise zero-fill.
+                    if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
+                        const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;  // dim 0 is addressed with unit stride
+                        out_ptr[oidx]   = (To)in_ptr[iidx];
+                        if (d0 == id0_half-1 && odd_id0)  // odd id[0]: the second slot of the last pair is zeroed
+                            out_ptr[oidx+1] = (To)0;
+                        else
+                            out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half];
+                    }
+                    else {
+                        // Pad remaining elements with 0s
+                        out_ptr[oidx]   = (To)0;
+                        out_ptr[oidx+1] = (To)0;
+                    }
+                }
+            }
+        }
+    }
+}
+// Writes `in` into `out` (starting `offset` elements in) as (value, 0) pairs and zero-fills pairs outside in.dims().
+template<typename To, typename Ti>
+void padArray(Array<To> out, const af::dim4 od, const af::dim4 os,
+              Array<Ti> const in, const dim_t offset)
+{
+    To* out_ptr = out.get() + offset;
+    const af::dim4 id = in.dims();
+    const af::dim4 is = in.strides();
+    const Ti* in_ptr = in.get();
+    // od/os come from the caller; od[0] is halved because each d0 iteration writes two To elements.
+    for (int d3 = 0; d3 < (int)od[3]; d3++) {
+        for (int d2 = 0; d2 < (int)od[2]; d2++) {
+            for (int d1 = 0; d1 < (int)od[1]; d1++) {
+                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+                    const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+                    // The pair at oidx/oidx+1 is filled from `in` only when all four indices fall inside in.dims().
+                    if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
+                        // Copy input elements to real elements, set imaginary elements to 0
+                        const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;  // dim 0 is addressed with unit stride
+                        out_ptr[oidx]   = (To)in_ptr[iidx];
+                        out_ptr[oidx+1] = (To)0;
+                    }
+                    else {
+                        // Pad remaining of the matrix to 0s
+                        out_ptr[oidx]   = (To)0;
+                        out_ptr[oidx+1] = (To)0;
+                    }
+                }
+            }
+        }
+    }
+}
+// Multiplies the signal block (at 0) and filter block (at `offset`) of `packed` as interleaved complex pairs, in place.
+template<typename T>
+void complexMultiply(Array<T> packed, const af::dim4 sig_dims, const af::dim4 sig_strides,
+                     const af::dim4 fit_dims, const af::dim4 fit_strides,
+                     ConvolveBatchKind kind, const dim_t offset)
+{
+    T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0);  // result overwrites the filter block for BATCH_KERNEL, else the signal block
+    T* in1_ptr = packed.get();
+    T* in2_ptr = packed.get() + offset;
+    // The loop extents and strides follow whichever block receives the result.
+    const af::dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims);
+    const af::dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides);
+    const af::dim4& i1d = sig_dims;
+    const af::dim4& i2d = fit_dims;
+    const af::dim4& i1s = sig_strides;
+    const af::dim4& i2s = fit_strides;
+    // od[0] is halved: each d0 iteration handles one (real, imaginary) pair.
+    for (int d3 = 0; d3 < (int)od[3]; d3++) {
+        for (int d2 = 0; d2 < (int)od[2]; d2++) {
+            for (int d1 = 0; d1 < (int)od[1]; d1++) {
+                for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+                    if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) {
+                        // Complex multiply each signal to equivalent filter
+                        const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;  // NOTE(review): dim_t expression narrowed to int
+                        const int iidx = ridx + 1;
+
+                        T a = in1_ptr[ridx];
+                        T b = in1_ptr[iidx];
+                        T c = in2_ptr[ridx];
+                        T d = in2_ptr[iidx];
+                        // (a+bi)(c+di): real = ac - bd; imag = ad + bc, written below as (a+b)(c+d) - ac - bd.
+                        T ac = a*c;
+                        T bd = b*d;
+
+                        out_ptr[ridx] = ac - bd;
+                        out_ptr[iidx] = (a+b) * (c+d) - ac - bd;
+                    }
+                    else if (kind == CONVOLVE_BATCH_SIGNAL) {
+                        // Complex multiply all signals to filter
+                        const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+                        const int iidx1 = ridx1 + 1;
+                        const int ridx2 = ridx1 % (i2s[3] * i2d[3]);  // filter index wraps at i2s[3] * i2d[3]
+                        const int iidx2 = iidx1 % (i2s[3] * i2d[3]);
+
+                        T a = in1_ptr[ridx1];
+                        T b = in1_ptr[iidx1];
+                        T c = in2_ptr[ridx2];
+                        T d = in2_ptr[iidx2];
+
+                        T ac = a*c;
+                        T bd = b*d;
+
+                        out_ptr[ridx1] = ac - bd;
+                        out_ptr[iidx1] = (a+b) * (c+d) - ac - bd;
+                    }
+                    else if (kind == CONVOLVE_BATCH_KERNEL) {
+                        // Complex multiply signal to all filters
+                        const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+                        const int iidx2 = ridx2 + 1;
+                        const int ridx1 = ridx2 % (i1s[3] * i1d[3]);  // signal index wraps at i1s[3] * i1d[3]
+                        const int iidx1 = iidx2 % (i1s[3] * i1d[3]);
+
+                        T a = in1_ptr[ridx1];
+                        T b = in1_ptr[iidx1];
+                        T c = in2_ptr[ridx2];
+                        T d = in2_ptr[iidx2];
+
+                        T ac = a*c;
+                        T bd = b*d;
+
+                        out_ptr[ridx2] = ac - bd;
+                        out_ptr[iidx2] = (a+b) * (c+d) - ac - bd;
+                    }
+                }
+            }
+        }
+    }
+}
+// Unpacks the paired values at in_ptr into out_ptr, dividing by fftScale; unless `expand`, reads start fd/2 elements in.
+template<typename To, typename Ti, bool roundOut>
+void reorderHelper(To* out_ptr, const af::dim4& od, const af::dim4& os,
+                   const Ti* in_ptr, const af::dim4& id, const af::dim4& is,
+                   const af::dim4& fd, const int half_di0, const int baseDim,
+                   const int fftScale, const bool expand)
+{
+    for (int d3 = 0; d3 < (int)od[3]; d3++) {
+        for (int d2 = 0; d2 < (int)od[2]; d2++) {
+            for (int d1 = 0; d1 < (int)od[1]; d1++) {
+                for (int d0 = 0; d0 < (int)od[0]; d0++) {
+                    int id0, id1, id2, id3;  // id0 is an element index; id1..id3 are already multiplied by their strides
+                    if (expand) {
+                        id0 = d0;
+                        id1 = d1 * is[1];
+                        id2 = d2 * is[2];
+                        id3 = d3 * is[3];
+                    }
+                    else {
+                        id0 = d0 + fd[0]/2;
+                        id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1];  // dims 1 and 2 are shifted only when baseDim covers them
+                        id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2];
+                        id3 = d3 * is[3];
+                    }
+                    // The parameter `id` (input dims) is not read in this function.
+                    int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0;
+                    // Slot 2*k of the input holds pair k's first value and slot 2*k+1 its second value.
+                    // Divide output elements by fftScale; the quotient is rounded with roundf
+                    // only when the roundOut template flag is set
+                    if (id0 < half_di0) {
+                        // Copy top elements
+                        int iidx = id3 + id2 + id1 + id0 * 2;
+                        if (roundOut)
+                            out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
+                        else
+                            out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
+                    }
+                    else if (id0 < half_di0 + (int)fd[0] - 1) {
+                        // Add signal and filter elements to central part
+                        int iidx1 = id3 + id2 + id1 + id0 * 2;
+                        int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
+                        if (roundOut)
+                            out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale));
+                        else
+                            out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale);
+                    }
+                    else {
+                        // Copy bottom elements
+                        const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
+                        if (roundOut)
+                            out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
+                        else
+                            out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
+                    }
+                }
+            }
+        }
+    }
+}
+// Unpacks the convolution result held in `packed` into `out` via reorderHelper, choosing the source block from `kind`.
+template<typename T, typename convT, bool roundOut, int baseDim>
+void reorder(Array<T> out, Array<convT> packed,
+             const Array<T> filter, const dim_t sig_half_d0, const dim_t fftScale,
+             const dim4 sig_tmp_dims, const dim4 sig_tmp_strides,
+             const dim4 filter_tmp_dims, const dim4 filter_tmp_strides,
+             bool expand, ConvolveBatchKind kind)
+{
+    // `packed` stores the signal block first; the filter block starts
+    // sig_tmp_strides[3] * sig_tmp_dims[3] elements later.
+    convT* const sig_block    = packed.get();
+    convT* const filter_block = sig_block + sig_tmp_strides[3] * sig_tmp_dims[3];
+
+    // With CONVOLVE_BATCH_KERNEL the data to reorder is read from the filter
+    // block; every other batch kind reads it from the signal block.
+    const bool from_filter = (kind == CONVOLVE_BATCH_KERNEL);
+    const convT* src            = from_filter ? filter_block       : sig_block;
+    const af::dim4& src_dims    = from_filter ? filter_tmp_dims    : sig_tmp_dims;
+    const af::dim4& src_strides = from_filter ? filter_tmp_strides : sig_tmp_strides;
+
+    const af::dim4 out_dims    = out.dims();
+    const af::dim4 out_strides = out.strides();
+    const af::dim4 filter_dims = filter.dims();
+
+    reorderHelper<T, convT, roundOut>(out.get(), out_dims, out_strides,
+                                      src, src_dims, src_strides,
+                                      filter_dims, sig_half_d0, baseDim,
+                                      fftScale, expand);
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/gradient.hpp b/src/backend/cpu/kernel/gradient.hpp
new file mode 100644
index 0000000000..1ab01abb0f
--- /dev/null
+++ b/src/backend/cpu/kernel/gradient.hpp
@@ -0,0 +1,89 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Finite differences of `in`: grad0 along dim 0, grad1 along dim 1; one-sided (factor 1) at the edges, central (factor 0.5) inside.
+template<typename T>
+void gradient(Array<T> grad0, Array<T> grad1, Array<T> const in)
+{
+    const af::dim4 dims = in.dims();
+    const af::dim4 inst = in.strides();
+    const af::dim4 g0st = grad0.strides();
+    const af::dim4 g1st = grad1.strides();
+    T *d_grad0    = grad0.get();
+    T *d_grad1    = grad1.get();
+    const T *d_in = in.get();
+    // Steps to the forward neighbour at the first element. A dim of extent 1 has no
+    // neighbour, so its step is 0: the difference becomes 0 instead of an out-of-bounds read.
+    const dim_t xstep = (dims[0] > 1) ? 1 : 0;
+    const dim_t ystep = (dims[1] > 1) ? inst[1] : 0;
+    const T v5 = scalar<T>(0.5);
+    const T v1 = scalar<T>(1.0);
+    for(dim_t idw = 0; idw < dims[3]; idw++) {
+        const dim_t inW = idw * inst[3];
+        const dim_t g0W = idw * g0st[3];
+        const dim_t g1W = idw * g1st[3];
+        for(dim_t idz = 0; idz < dims[2]; idz++) {
+            const dim_t inZW = inW + idz * inst[2];
+            const dim_t g0ZW = g0W + idz * g0st[2];
+            const dim_t g1ZW = g1W + idz * g1st[2];
+            dim_t xl, xr, yl,yr;  // offsets of the two samples that are subtracted (xl - xr, yl - yr)
+            T f0, f1;             // scale factors for the dim-0 and dim-1 differences
+            for(dim_t idy = 0; idy < dims[1]; idy++) {
+                const dim_t inYZW = inZW + idy * inst[1];
+                const dim_t g0YZW = g0ZW + idy * g0st[1];
+                const dim_t g1YZW = g1ZW + idy * g1st[1];
+                if(idy == 0) {
+                    yl = inYZW + ystep;  // forward difference (0 when dims[1] == 1)
+                    yr = inYZW;
+                    f1 = v1;
+                } else if(idy == dims[1] - 1) {
+                    yl = inYZW;          // backward difference; only reached when dims[1] > 1
+                    yr = inYZW - inst[1];
+                    f1 = v1;
+                } else {
+                    yl = inYZW + inst[1];
+                    yr = inYZW - inst[1];
+                    f1 = v5;
+                }
+                for(dim_t idx = 0; idx < dims[0]; idx++) {
+                    const dim_t inMem = inYZW + idx;  // dim 0 of `in` is addressed with unit stride
+                    const dim_t g0Mem = g0YZW + idx;
+                    const dim_t g1Mem = g1YZW + idx;
+                    if(idx == 0) {
+                        xl = inMem + xstep;  // forward difference (0 when dims[0] == 1)
+                        xr = inMem;
+                        f0 = v1;
+                    } else if(idx == dims[0] - 1) {
+                        xl = inMem;          // backward difference; only reached when dims[0] > 1
+                        xr = inMem - 1;
+                        f0 = v1;
+                    } else {
+                        xl = inMem + 1;
+                        xr = inMem - 1;
+                        f0 = v5;
+                    }
+
+                    d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]);
+                    d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]);
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/harris.hpp b/src/backend/cpu/kernel/harris.hpp
new file mode 100644
index 0000000000..183cf37e77
--- /dev/null
+++ b/src/backend/cpu/kernel/harris.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Element-wise products of the first-order derivatives over in_len elements: ixx = ix*ix, ixy = ix*iy, iyy = iy*iy.
+template<typename T>
+void second_order_deriv(Array<T> ixx, Array<T> ixy, Array<T> iyy,
+                        const unsigned in_len, const Array<T> ix, const Array<T> iy)
+{
+    const T* dx = ix.get();
+    const T* dy = iy.get();
+    T* dxx      = ixx.get();
+    T* dxy      = ixy.get();
+    T* dyy      = iyy.get();
+    for (unsigned i = 0; i != in_len; ++i) {
+        dxx[i] = dx[i] * dx[i];
+        dxy[i] = dx[i] * dy[i];
+        dyy[i] = dy[i] * dy[i];
+    }
+}
+// Writes det(M) - k_thr * trace(M)^2 for every pixel that lies at least border_len away from the image edges.
+template<typename T>
+void harris_responses(Array<T> resp, const unsigned idim0, const unsigned idim1,
+                      const Array<T> ixx, const Array<T> ixy, const Array<T> iyy,
+                      const float k_thr, const unsigned border_len)
+{
+    const T* sxx = ixx.get();
+    const T* sxy = ixy.get();
+    const T* syy = iyy.get();
+    T* response  = resp.get();
+
+    // `col` spans idim1 and `row` spans idim0; pixels are addressed as col * idim0 + row.
+    const unsigned col_end = idim1 - border_len;
+    const unsigned row_end = idim0 - border_len;
+    for (unsigned col = border_len; col < col_end; ++col) {
+        for (unsigned row = border_len; row < row_end; ++row) {
+            const unsigned pix = col * idim0 + row;
+            // M = [xx xy; xy yy] at this pixel.
+            const T xx = sxx[pix], xy = sxy[pix], yy = syy[pix];
+            const T trace       = xx + yy;
+            const T determinant = xx * yy - xy * xy;
+            response[pix] = determinant - k_thr * (trace * trace);
+        }
+    }
+}
+// Non-maximal suppression: records pixels whose response beats all 8 neighbours and is >= min_resp; *count tallies every hit.
+template<typename T>
+void non_maximal(Array<float> xOut, Array<float> yOut, Array<float> respOut, unsigned* count,
+                 const unsigned idim0, const unsigned idim1, const Array<T> respIn,
+                 const float min_resp, const unsigned border_len, const unsigned max_corners)
+{
+    float* x_out = xOut.get();
+    float* y_out = yOut.get();
+    float* resp_out = respOut.get();
+    const T* resp_in = respIn.get();
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned r = border_len + 1;
+    // x spans idim1 and y spans idim0; responses are addressed as x * idim0 + y.
+    for (unsigned x = r; x < idim1 - r; x++) {
+        for (unsigned y = r; y < idim0 - r; y++) {
+            const T v = resp_in[x * idim0 + y];
+
+            // Find maximum neighborhood response
+            T max_v;  // NOTE(review): `max` below is unqualified and is declared outside this block
+            max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
+            max_v = max(max_v, resp_in[(x-1) * idim0 + y  ]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y  ]);
+            max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
+            max_v = max(max_v, resp_in[(x)   * idim0 + y+1]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
+
+            // Stores corner to {x,y,resp}_out if its response is strictly greater than
+            // its 8-neighborhood and greater than or equal to the minimum response
+            if (v > max_v && v >= (T)min_resp) {
+                const unsigned idx = *count;
+                *count += 1;  // incremented even when idx >= max_corners; only the stores below are capped
+                if (idx < max_corners) {
+                    x_out[idx]    = (float)x;
+                    y_out[idx]    = (float)y;
+                    resp_out[idx] = (float)v;
+                }
+            }
+        }
+    }
+}
+// Gathers n_corners corners: x/y are looked up through respIdx while responses are copied position by position.
+static void keep_corners(Array<float> xOut, Array<float> yOut, Array<float> respOut,
+                         const Array<float> xIn, const Array<float> yIn,
+                         const Array<float> respIn, const Array<unsigned> respIdx,
+                         const unsigned n_corners)
+{
+    const float* src_x    = xIn.get();
+    const float* src_y    = yIn.get();
+    const float* src_resp = respIn.get();
+    const unsigned* order = respIdx.get();
+    float* dst_x    = xOut.get();
+    float* dst_y    = yOut.get();
+    float* dst_resp = respOut.get();
+
+    // Coordinates are read at order[n]; the response is read at n itself.
+    for (unsigned n = 0; n != n_corners; ++n) {
+        dst_x[n]    = src_x[order[n]];
+        dst_y[n]    = src_y[order[n]];
+        dst_resp[n] = src_resp[n];
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/histogram.hpp b/src/backend/cpu/kernel/histogram.hpp
new file mode 100644
index 0000000000..9b9b897c02
--- /dev/null
+++ b/src/backend/cpu/kernel/histogram.hpp
@@ -0,0 +1,49 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Bins dims 0-1 of `in` into nbins counters of `out`, once per (dim 2, dim 3) batch; counts are added to out's current contents.
+template<typename OutT, typename InT, bool IsLinear>
+void histogram(Array<OutT> out, Array<InT> const in,
+               unsigned const nbins, double const minval, double const maxval)
+{
+    dim4 const outDims   = out.dims();
+    float const step     = (maxval - minval)/(float)nbins;
+    dim4 const inDims    = in.dims();
+    dim4 const iStrides  = in.strides();
+    dim4 const oStrides  = out.strides();
+    dim_t const nElems   = inDims[0]*inDims[1];
+    // Cursors into the current batch; both advance by their dim-2 stride after each batch.
+    OutT *outData    = out.get();
+    const InT* inData= in.get();
+    // Dim 3 is reached by continuing to step with the dim-2 strides (strides[3] is never read).
+    for(dim_t b3 = 0; b3 < outDims[3]; b3++) {
+        for(dim_t b2 = 0; b2 < outDims[2]; b2++) {
+            for(dim_t i=0; i<nElems; i++) {
+                dim_t idx = IsLinear ? i : ((i % inDims[0]) + (i / inDims[0])*iStrides[1]);  // dim_t: an int would truncate large offsets
+                int bin = (int)((inData[idx] - minval) / step);
+                bin = std::max(bin, 0);                 // values below minval land in the first bin
+                bin = std::min(bin, (int)(nbins - 1));  // values at or above maxval land in the last bin
+                outData[bin]++;
+            }
+            inData  += iStrides[2];
+            outData += oStrides[2];
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/hsv_rgb.hpp b/src/backend/cpu/kernel/hsv_rgb.hpp
new file mode 100644
index 0000000000..c1f59a1737
--- /dev/null
+++ b/src/backend/cpu/kernel/hsv_rgb.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+* Copyright (c) 2015, ArrayFire
+* All rights reserved.
+*
+* This file is distributed under 3-clause BSD license.
+* The complete license agreement can be obtained at:
+* http://arrayfire.com/licenses/BSD-3-Clause
+********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <cmath>
+
+namespace cpu
+{
+namespace kernel
+{
+// Converts HSV (channels along dim 2) to RGB, one image per dim-3 batch; `out` is indexed with in's dim 0-2 strides.
+template<typename T>
+void hsv2rgb(Array<T> out, Array<T> const in)
+{
+    const af::dim4 idims     = in.dims();
+    const af::dim4 istrides  = in.strides();
+    const dim_t outBatchStep = out.strides()[3];
+    const dim_t channelStep  = istrides[2];
+    const dim_t numBatches   = idims[3];
+
+    for(dim_t batch = 0; batch < numBatches; ++batch) {
+        const T* hsv = in.get() + batch * istrides[3];
+        T* rgb       = out.get() + batch * outBatchStep;
+
+        for(dim_t col = 0; col < idims[1]; ++col) {
+            // offset of this dim-1 slice
+            const dim_t colOff = col * istrides[1];
+            for(dim_t row = 0; row < idims[0]; ++row) {
+                // the three channel offsets of this pixel
+                const dim_t c0 = row * istrides[0] + colOff;
+                const dim_t c1 = c0 + channelStep;
+                const dim_t c2 = c1 + channelStep;
+
+                const T hue = hsv[c0];
+                const T sat = hsv[c1];
+                const T val = hsv[c2];
+
+                // the pixel stays at 0 when no case below matches (negative sector)
+                T red = 0, green = 0, blue = 0;
+
+                const int sector = (int)(hue * 6);
+                const T frac = hue * 6 - sector;
+                const T lo   = val * (1 - sat);
+                const T dn   = val * (1 - frac * sat);
+                const T up   = val * (1 - (1 - frac) * sat);
+
+                switch (sector % 6) {
+                    case 0: red = val; green = up;  blue = lo;  break;
+                    case 1: red = dn;  green = val; blue = lo;  break;
+                    case 2: red = lo;  green = val; blue = up;  break;
+                    case 3: red = lo;  green = dn;  blue = val; break;
+                    case 4: red = up;  green = lo;  blue = val; break;
+                    case 5: red = val; green = lo;  blue = dn;  break;
+                }
+
+                rgb[c0] = red;
+                rgb[c1] = green;
+                rgb[c2] = blue;
+            }
+        }
+    }
+}
+// Converts RGB (channels along dim 2) to HSV, one image per dim-3 batch; the hue sector value is divided by 6.
+template<typename T>
+void rgb2hsv(Array<T> out, Array<T> const in)
+{
+    const af::dim4 idims    = in.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+    const dim_t numBatches  = idims[3];
+
+    for(dim_t batch = 0; batch < numBatches; ++batch) {
+        const T* rgb = in.get() + batch * istrides[3];
+        T* hsv       = out.get() + batch * ostrides[3];
+
+        for(dim_t col = 0; col < idims[1]; ++col) {
+            // offsets of this dim-1 slice in the input and in the output
+            const dim_t inCol  = col * istrides[1];
+            const dim_t outCol = col * ostrides[1];
+
+            for(dim_t row = 0; row < idims[0]; ++row) {
+                // channel offsets of this pixel: input first, then output
+                const dim_t i0 = row * istrides[0] + inCol;
+                const dim_t i1 = i0 + istrides[2];
+                const dim_t i2 = i1 + istrides[2];
+
+                const dim_t o0 = row * ostrides[0] + outCol;
+                const dim_t o1 = o0 + ostrides[2];
+                const dim_t o2 = o1 + ostrides[2];
+
+                const T red   = rgb[i0];
+                const T green = rgb[i1];
+                const T blue  = rgb[i2];
+                const T hi    = std::max(std::max(red, green), blue);
+                const T lo    = std::min(std::min(red, green), blue);
+                const T range = hi - lo;
+
+                T hue = 0;
+
+                if (hi != lo) {
+                    if (hi == red)   hue = (green - blue) / range + (green < blue ? 6 : 0);
+                    if (hi == green) hue = (blue - red) / range + 2;
+                    if (hi == blue)  hue = (red - green) / range + 4;
+                    hue = hue / 6.0f;
+                }
+
+                hsv[o0] = hue;
+                hsv[o1] = (hi == 0.0f ? 0 : range / hi);
+                hsv[o2] = hi;
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/identity.hpp b/src/backend/cpu/kernel/identity.hpp
new file mode 100644
index 0000000000..242ba9dae3
--- /dev/null
+++ b/src/backend/cpu/kernel/identity.hpp
@@ -0,0 +1,37 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Fills every dim0 x dim1 slice of `out` with ones on the diagonal and zeros elsewhere; strides are not consulted.
+template<typename T>
+void identity(Array<T> out)
+{
+    const af::dim4 odims   = out.dims();
+    const dim_t sliceElems = odims[0] * odims[1];
+    const dim_t numSlices  = odims[2] * odims[3];
+    T *slice = out.get();
+    for (dim_t s = 0; s < numSlices; ++s, slice += sliceElems) {
+        for (dim_t col = 0; col < odims[1]; ++col) {
+            for (dim_t row = 0; row < odims[0]; ++row) {
+                slice[col * odims[0] + row] = (row == col) ? scalar<T>(1) : scalar<T>(0);
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/iir.hpp b/src/backend/cpu/kernel/iir.hpp
new file mode 100644
index 0000000000..5182094fc2
--- /dev/null
+++ b/src/backend/cpu/kernel/iir.hpp
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void iir(Array<T> y, Array<T> c, Array<T> const a)
+{
+    // Recursive (feedback) stage of an IIR filter, run along dim 0 of every column of `c`:
+    //     y[i]   = (c[i] + z[0]) / a[0]
+    //     z[k-1] = z[k] - a[k] * y[i]        for k = 1 .. num_a-1
+    // y: output, written over the extents of c; a: coefficients (a.dims()[0] of them, at least one).
+    const af::dim4 ydims = c.dims();   // loop extents are taken from the input `c`
+    const int num_a = a.dims()[0];
+
+    for (int l = 0; l < (int)ydims[3]; l++) {
+        dim_t yidx3 = l * y.strides()[3];
+        dim_t cidx3 = l * c.strides()[3];
+        dim_t aidx3 = l * a.strides()[3];
+        for (int k = 0; k < (int)ydims[2]; k++) {
+            dim_t yidx2 = k * y.strides()[2] + yidx3;
+            dim_t cidx2 = k * c.strides()[2] + cidx3;
+            dim_t aidx2 = k * a.strides()[2] + aidx3;
+            for (int j = 0; j < (int)ydims[1]; j++) {
+                dim_t yidx1 = j * y.strides()[1] + yidx2;
+                dim_t cidx1 = j * c.strides()[1] + cidx2;
+                dim_t aidx1 = j * a.strides()[1] + aidx2;
+                // Filter state (delay line): value-initialised afresh for every column.
+                std::vector<T> h_z(num_a);
+                // A 1-D `a` is shared by all columns; otherwise each column reads its own coefficients.
+                const T *h_a = a.get() + (a.ndims() > 1 ? aidx1 : 0);
+                T *h_c = c.get() + cidx1;
+                T *h_y = y.get() + yidx1;
+
+                for (int i = 0; i < (int)ydims[0]; i++) {
+                    // Named `yi` so that it no longer shadows the output parameter `y`.
+                    const T yi = h_y[i] = (h_c[i] + h_z[0]) /  h_a[0];
+                    for (int ii = 1; ii < num_a; ii++) {
+                        h_z[ii - 1] = h_z[ii] - h_a[ii] * yi;
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/index.hpp b/src/backend/cpu/kernel/index.hpp
new file mode 100644
index 0000000000..343d7ae4e7
--- /dev/null
+++ b/src/backend/cpu/kernel/index.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <vector>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void index(Array<T> out, Array<T> const in,
+           std::vector<bool> const isSeq, std::vector<af_seq> const seqs,
+           std::vector< Array<uint> > const idxArrs)
+{   // Gather: per dimension d the source coordinate is (output coord + iOffs[d]) when isSeq[d] is set, else idxArrs[d][output coord].
+    const af::dim4 iDims    = in.dims();
+    const af::dim4 dDims    = in.getDataDims();
+    const af::dim4 iOffs    = toOffset(seqs, dDims);   // per-dimension offsets produced by toOffset from seqs
+    const af::dim4 iStrds   = toStride(seqs, dDims);   // per-dimension source strides produced by toStride from seqs
+    const af::dim4 oDims    = out.dims();
+    const af::dim4 oStrides = out.strides();
+    const T *src        = in.get();
+    T *dst        = out.get();
+    const uint* ptr0    = idxArrs[0].get();   // all four entries of idxArrs are accessed unconditionally,
+    const uint* ptr1    = idxArrs[1].get();   // so the vector must hold four arrays even when a
+    const uint* ptr2    = idxArrs[2].get();   // dimension is indexed by a sequence
+    const uint* ptr3    = idxArrs[3].get();
+    // Each level resolves one source coordinate, passes it through trimIndex (presumably a bounds fix-up -- confirm in utility.hpp) and scales it by the source stride.
+    for (dim_t l=0; l<oDims[3]; ++l) {
+
+        dim_t lOff   = l*oStrides[3];
+        dim_t inIdx3 = trimIndex(isSeq[3] ? l+iOffs[3] : ptr3[l], iDims[3]);
+        dim_t inOff3 = inIdx3*iStrds[3];
+
+        for (dim_t k=0; k<oDims[2]; ++k) {
+
+            dim_t kOff   = k*oStrides[2];
+            dim_t inIdx2 = trimIndex(isSeq[2] ? k+iOffs[2] : ptr2[k], iDims[2]);
+            dim_t inOff2 = inIdx2*iStrds[2];
+
+            for (dim_t j=0; j<oDims[1]; ++j) {
+
+                dim_t jOff   = j*oStrides[1];
+                dim_t inIdx1 = trimIndex(isSeq[1] ? j+iOffs[1] : ptr1[j], iDims[1]);
+                dim_t inOff1 = inIdx1*iStrds[1];
+
+                for (dim_t i=0; i<oDims[0]; ++i) {
+
+                    dim_t iOff   = i*oStrides[0];
+                    dim_t inIdx0 = trimIndex(isSeq[0] ? i+iOffs[0] : ptr0[i], iDims[0]);
+                    dim_t inOff0 = inIdx0*iStrds[0];
+                    // destination offset uses the output strides, source offset the per-dimension offsets above
+                    dst[lOff+kOff+jOff+iOff] = src[inOff3+inOff2+inOff1+inOff0];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/iota.hpp b/src/backend/cpu/kernel/iota.hpp
new file mode 100644
index 0000000000..0f824295a4
--- /dev/null
+++ b/src/backend/cpu/kernel/iota.hpp
@@ -0,0 +1,45 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void iota(Array<T> output, const af::dim4 &sdims, const af::dim4 &tdims)
+{
+    // Element (x,y,z,w) receives the column-major linear index of (x%s0, y%s1, z%s2, w%s3) within
+    // a block of shape sdims, so the sequence repeats along every dimension of the output.
+    // Dim 0 of the output is addressed with unit stride; `tdims` is not read by this function.
+    const af::dim4 extent = output.dims();
+    T* dst = output.get();
+    const af::dim4 pitch = output.strides();
+
+    for (dim_t w = 0; w < extent[3]; ++w) {
+        const T base3 = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2];
+        for (dim_t z = 0; z < extent[2]; ++z) {
+            const T base2 = base3 + (z % sdims[2]) * sdims[0] * sdims[1];
+            for (dim_t y = 0; y < extent[1]; ++y) {
+                const T base1 = base2 + (y % sdims[1]) * sdims[0];
+                const dim_t rowOff = w * pitch[3] + z * pitch[2] + y * pitch[1];
+                for (dim_t x = 0; x < extent[0]; ++x) {
+                    dst[rowOff + x] = base1 + (x % sdims[0]);
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/ireduce.hpp b/src/backend/cpu/kernel/ireduce.hpp
new file mode 100644
index 0000000000..848885515b
--- /dev/null
+++ b/src/backend/cpu/kernel/ireduce.hpp
@@ -0,0 +1,108 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T> double cabs(const T in) { return static_cast<double>(in); }     // generic: the value itself, as double
+static double cabs(const char in) { return (in > 0) ? 1.0 : 0.0; }                    // char: 1 when positive, else 0
+static double cabs(const cfloat &in) { return static_cast<double>(abs(in)); }         // complex: magnitude
+static double cabs(const cdouble &in) { return static_cast<double>(abs(in)); }        // complex: magnitude
+
+template<af_op_t op, typename T>
+struct MinMaxOp
+{   // Generic accumulator: keeps the value whose cabs() is smallest, together with its index.
+    T m_val;      // best value seen so far
+    uint m_idx;   // index that came with m_val
+    MinMaxOp(T val, uint idx) : m_val(val), m_idx(idx) {}
+
+    // Accepts (val, idx) when it is strictly smaller, or equal in cabs() with a larger index.
+    void operator()(T val, uint idx)
+    {
+        const double cand = cabs(val);
+        const double best = cabs(m_val);
+        const bool smaller = cand < best;
+        const bool tied    = (cand == best) && (idx > m_idx);
+        if (smaller || tied) {
+            m_val = val;
+            m_idx = idx;
+        }
+    }
+};
+
+template<typename T>
+struct MinMaxOp<af_max_t, T>
+{   // Maximum accumulator: keeps the value whose cabs() is largest, together with its index.
+    T m_val;      // best value seen so far
+    uint m_idx;   // index that came with m_val
+    MinMaxOp(T val, uint idx) : m_val(val), m_idx(idx) {}
+
+    // Accepts (val, idx) when it is strictly larger, or equal in cabs() with an index <= m_idx.
+    void operator()(T val, uint idx)
+    {
+        const double cand = cabs(val);
+        const double best = cabs(m_val);
+        const bool larger = cand > best;
+        const bool tied   = (cand == best) && (idx <= m_idx);
+        if (larger || tied) {
+            m_val = val;
+            m_idx = idx;
+        }
+    }
+};
+
+template<af_op_t op, typename T, int D>
+struct ireduce_dim
+{
+    // Peels off dimension D-1: walks the output extent along it and recurses with D-1 remaining.
+    void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+                    const Array<T> input, const dim_t inOffset, const int dim)
+    {
+        const int axis = D - 1;
+        const dim_t count = output.dims()[axis];
+        const dim_t oStep = output.strides()[axis];
+        const dim_t iStep = input.strides()[axis];
+        ireduce_dim<op, T, axis> inner;
+        for (dim_t n = 0; n < count; ++n)
+            inner(output, locArray, outOffset + n * oStep, input, inOffset + n * iStep, dim);
+    }
+};
+
+template<af_op_t op, typename T>
+struct ireduce_dim<op, T, 0>
+{
+    // Recursion leaf: scans input.dims()[dim] elements starting at inOffset with stride
+    // input.strides()[dim] and folds them through MinMaxOp<op, T>; the winning value goes to
+    // output[outOffset] and its position along `dim` to locArray[outOffset].
+    void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+                    const Array<T> input, const dim_t inOffset, const int dim)
+    {
+        const dim_t len  = input.dims()[dim];
+        const dim_t step = input.strides()[dim];
+        T const * const src = input.get();
+        T * const valOut = output.get();
+        uint * const idxOut = locArray.get();
+
+        // Seeded with the first element at index 0; the loop then visits index 0 again.
+        MinMaxOp<op, T> best(src[inOffset], 0);
+        for (dim_t n = 0; n < len; ++n) best(src[inOffset + n * step], n);
+
+        valOut[outOffset] = best.m_val;
+        idxOut[outOffset] = best.m_idx;
+    }
+};
+
+}
+}
diff --git a/src/backend/cpu/kernel/join.hpp b/src/backend/cpu/kernel/join.hpp
new file mode 100644
index 0000000000..b0d92c9978
--- /dev/null
+++ b/src/backend/cpu/kernel/join.hpp
@@ -0,0 +1,144 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<int dim>
+af::dim4 calcOffset(const af::dim4 dims)
+{
+    // Keeps only component `dim` of `dims`; the other three components are set to 0.
+    af::dim4 offset;
+    for (int axis = 0; axis < 4; ++axis) {
+        offset[axis] = (axis == dim) ? dims[axis] : 0;
+    }
+    return offset;
+}
+
+template<typename To, typename Tx, int dim>
+void join_append(To *out, const Tx *X, const af::dim4 &offset,
+           const af::dim4 &odims, const af::dim4 &xdims,
+           const af::dim4 &ost, const af::dim4 &xst)
+{
+    // Copies the xdims-shaped array X into `out`, shifting every coordinate by `offset`.
+    // xst / ost are the element strides of X / out; dim 0 is addressed with unit stride in both.
+    // Neither `odims` nor the template argument `dim` is used by the body.
+    for(dim_t w = 0; w < xdims[3]; ++w) {
+        const dim_t srcW = w * xst[3];
+        const dim_t dstW = (w + offset[3]) * ost[3];
+
+        for(dim_t z = 0; z < xdims[2]; ++z) {
+            const dim_t srcZ = srcW + z * xst[2];
+            const dim_t dstZ = dstW + (z + offset[2]) * ost[2];
+
+            for(dim_t y = 0; y < xdims[1]; ++y) {
+                const dim_t srcY = srcZ + y * xst[1];
+                const dim_t dstY = dstZ + (y + offset[1]) * ost[1] + offset[0];
+                for(dim_t x = 0; x < xdims[0]; ++x) {
+                    out[dstY + x] = X[srcY + x];
+                }
+            }
+        }
+    }
+}
+
+template<typename Tx, typename Ty>
+void join(Array<Tx> out, const int dim, const Array<Tx> first, const Array<Ty> second)
+{
+    // Concatenates `first` and `second` along `dim` into `out`: `first` is copied to the origin
+    // and `second` directly after it, i.e. shifted by first.dims()[dim] along `dim`.
+    // A `dim` outside 0..3 leaves `out` untouched.
+    Tx* dst = out.get();
+    const Tx* lhs = first.get();
+    const Ty* rhs = second.get();
+
+    const af::dim4 origin(0,0,0,0);
+    const af::dim4 oDims = out.dims();
+    const af::dim4 lDims = first.dims();
+    const af::dim4 rDims = second.dims();
+    const af::dim4 oStr = out.strides();
+    const af::dim4 lStr = first.strides();
+    const af::dim4 rStr = second.strides();
+
+    // The axis has to be a compile-time constant for join_append / calcOffset, hence one
+    // explicit case per dimension.
+    switch(dim) {
+        case 0:
+            join_append<Tx, Tx, 0>(dst, lhs, origin, oDims, lDims, oStr, lStr);
+            join_append<Tx, Ty, 0>(dst, rhs, calcOffset<0>(lDims), oDims, rDims, oStr, rStr);
+            break;
+        case 1:
+            join_append<Tx, Tx, 1>(dst, lhs, origin, oDims, lDims, oStr, lStr);
+            join_append<Tx, Ty, 1>(dst, rhs, calcOffset<1>(lDims), oDims, rDims, oStr, rStr);
+            break;
+        case 2:
+            join_append<Tx, Tx, 2>(dst, lhs, origin, oDims, lDims, oStr, lStr);
+            join_append<Tx, Ty, 2>(dst, rhs, calcOffset<2>(lDims), oDims, rDims, oStr, rStr);
+            break;
+        case 3:
+            join_append<Tx, Tx, 3>(dst, lhs, origin, oDims, lDims, oStr, lStr);
+            join_append<Tx, Ty, 3>(dst, rhs, calcOffset<3>(lDims), oDims, rDims, oStr, rStr);
+            break;
+    }
+}
+
+template<typename T, int n_arrays>
+void join(const int dim, Array<T> out, const std::vector<Array<T>> inputs)
+{
+    // Concatenates inputs[0 .. n_arrays-1] along `dim`; each input lands after the summed extents of its predecessors.
+    T *dst = out.get();
+    const af::dim4 oDims = out.dims();
+    const af::dim4 oStr  = out.strides();
+    const af::dim4 origin(0,0,0,0);
+    af::dim4 used = origin;   // running sum of the dims of the inputs already placed
+    switch(dim) {
+        case 0:
+            join_append<T, T, 0>(dst, inputs[0].get(), origin, oDims, inputs[0].dims(), oStr, inputs[0].strides());
+            for(int n = 1; n < n_arrays; ++n) {
+                used += inputs[n - 1].dims();
+                join_append<T, T, 0>(dst, inputs[n].get(), calcOffset<0>(used),
+                                     oDims, inputs[n].dims(), oStr, inputs[n].strides());
+            }
+            break;
+        case 1:
+            join_append<T, T, 1>(dst, inputs[0].get(), origin, oDims, inputs[0].dims(), oStr, inputs[0].strides());
+            for(int n = 1; n < n_arrays; ++n) {
+                used += inputs[n - 1].dims();
+                join_append<T, T, 1>(dst, inputs[n].get(), calcOffset<1>(used),
+                                     oDims, inputs[n].dims(), oStr, inputs[n].strides());
+            }
+            break;
+        case 2:
+            join_append<T, T, 2>(dst, inputs[0].get(), origin, oDims, inputs[0].dims(), oStr, inputs[0].strides());
+            for(int n = 1; n < n_arrays; ++n) {
+                used += inputs[n - 1].dims();
+                join_append<T, T, 2>(dst, inputs[n].get(), calcOffset<2>(used),
+                                     oDims, inputs[n].dims(), oStr, inputs[n].strides());
+            }
+            break;
+        case 3:
+            join_append<T, T, 3>(dst, inputs[0].get(), origin, oDims, inputs[0].dims(), oStr, inputs[0].strides());
+            for(int n = 1; n < n_arrays; ++n) {
+                used += inputs[n - 1].dims();
+                join_append<T, T, 3>(dst, inputs[n].get(), calcOffset<3>(used),
+                                     oDims, inputs[n].dims(), oStr, inputs[n].strides());
+            }
+            break;
+    }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/lookup.hpp b/src/backend/cpu/kernel/lookup.hpp
new file mode 100644
index 0000000000..a290ef2fca
--- /dev/null
+++ b/src/backend/cpu/kernel/lookup.hpp
@@ -0,0 +1,62 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <vector>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename InT, typename IndexT>
+void lookup(Array<InT> out, Array<InT> const input,
+            Array<IndexT> const indices, unsigned const dim)
+{   // Gather along one dimension: out takes input's element whose coordinate along `dim` is read from `indices`.
+    const af::dim4 iDims    = input.dims();
+    const af::dim4 oDims    = out.dims();
+    const af::dim4 iStrides = input.strides();
+    const af::dim4 oStrides = out.strides();
+    const InT *inPtr   = input.get();
+    const IndexT *idxPtr = indices.get();   // read as a flat list, subscripted by the output coordinate along `dim`
+    // Along every dimension other than `dim` the input coordinate equals the output coordinate.
+    InT *outPtr = out.get();
+    // Index values are cast to dim_t and passed through trimIndex (presumably a bounds fix-up -- confirm in utility.hpp).
+    for (dim_t l=0; l<oDims[3]; ++l) {
+
+        dim_t iLOff = iStrides[3]*(dim==3 ? trimIndex((dim_t)idxPtr[l], iDims[3]): l);
+        dim_t oLOff = l*oStrides[3];
+
+        for (dim_t k=0; k<oDims[2]; ++k) {
+
+            dim_t iKOff = iStrides[2]*(dim==2 ? trimIndex((dim_t)idxPtr[k], iDims[2]): k);
+            dim_t oKOff = k*oStrides[2];
+
+            for (dim_t j=0; j<oDims[1]; ++j) {
+
+                dim_t iJOff = iStrides[1]*(dim==1 ? trimIndex((dim_t)idxPtr[j], iDims[1]): j);
+                dim_t oJOff = j*oStrides[1];
+
+                for (dim_t i=0; i<oDims[0]; ++i) {
+
+                    dim_t iIOff = iStrides[0]*(dim==0 ? trimIndex((dim_t)idxPtr[i], iDims[0]): i);
+                    dim_t oIOff = i*oStrides[0];
+
+                    outPtr[oLOff+oKOff+oJOff+oIOff] = inPtr[iLOff+iKOff+iJOff+iIOff];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/lu.hpp b/src/backend/cpu/kernel/lu.hpp
new file mode 100644
index 0000000000..35b0c19b84
--- /dev/null
+++ b/src/backend/cpu/kernel/lu.hpp
@@ -0,0 +1,80 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void lu_split(Array<T> lower, Array<T> upper, const Array<T> in)
+{   // Splits `in` into `lower` (unit diagonal, zeros above it) and `upper` (diagonal and above, zeros below).
+    T *l = lower.get();
+    T *u = upper.get();
+    const T *i = in.get();
+    // Extents and strides of all three arrays; the loops below run over the extents of `in`.
+    af::dim4 ldm = lower.dims();
+    af::dim4 udm = upper.dims();
+    af::dim4 idm = in.dims();
+    af::dim4 lst = lower.strides();
+    af::dim4 ust = upper.strides();
+    af::dim4 ist = in.strides();
+    // ow / oz walk dims 3 / 2, oy is the column (dim 1) and ox the row (dim 0).
+    for(dim_t ow = 0; ow < idm[3]; ow++) {
+        const dim_t lW = ow * lst[3];
+        const dim_t uW = ow * ust[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < idm[2]; oz++) {
+            const dim_t lZW = lW + oz * lst[2];
+            const dim_t uZW = uW + oz * ust[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < idm[1]; oy++) {
+                const dim_t lYZW = lZW + oy * lst[1];
+                const dim_t uYZW = uZW + oy * ust[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+                // Writes are guarded: `lower` is only touched for columns oy < ldm[1], `upper` only for rows ox < udm[0].
+                for(dim_t ox = 0; ox < idm[0]; ox++) {
+                    const dim_t lMem = lYZW + ox;   // dim 0 is addressed with unit stride in all three arrays
+                    const dim_t uMem = uYZW + ox;
+                    const dim_t iMem = iYZW + ox;
+                    if(ox > oy) {                   // strictly below the diagonal
+                        if(oy < ldm[1]) l[lMem] = i[iMem];
+                        if(ox < udm[0]) u[uMem] = scalar<T>(0);
+                    } else if (oy > ox) {           // strictly above the diagonal
+                        if(oy < ldm[1]) l[lMem] = scalar<T>(0);
+                        if(ox < udm[0]) u[uMem] = i[iMem];
+                    } else if(ox == oy) {           // on the diagonal (the only remaining case)
+                        if(oy < ldm[1]) l[lMem] = scalar<T>(1.0);
+                        if(ox < udm[0]) u[uMem] = i[iMem];
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Applies the row interchanges listed in `pivot` (1-indexed) to `p`, in order: p[j] <-> p[pivot[j]-1].
+// `inline`: a non-template function defined in a header must be inline to respect the one-definition rule.
+inline void convertPivot(Array<int> p, Array<int> pivot)
+{
+    int *d_pi = pivot.get();
+    int *d_po = p.get();
+    const int d0 = (int)pivot.dims()[0];
+    // The swaps are order-dependent, so the list is replayed front to back.
+    for(int j = 0; j < d0; j++) std::swap(d_po[j], d_po[d_pi[j] - 1]);
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/match_template.hpp b/src/backend/cpu/kernel/match_template.hpp
new file mode 100644
index 0000000000..ae41364018
--- /dev/null
+++ b/src/backend/cpu/kernel/match_template.hpp
@@ -0,0 +1,141 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename OutT, typename InT, af_match_type MatchT>
+void matchTemplate(Array<OutT> out, const Array<InT> sImg, const Array<InT> tImg)
+{
+    // For every pixel (si, sj) of each 2D slice of the search image sImg, accumulates the MatchT
+    // metric between the template tImg and the equally sized window whose first element is that
+    // pixel, and stores it in out. Window samples that fall outside sImg are read as 0.
+    const af::dim4 sDims = sImg.dims();
+    const af::dim4 tDims = tImg.dims();
+    const af::dim4 sStrides = sImg.strides();
+    const af::dim4 tStrides = tImg.strides();
+
+    const dim_t tDim0  = tDims[0];
+    const dim_t tDim1  = tDims[1];
+    const dim_t sDim0  = sDims[0];
+    const dim_t sDim1  = sDims[1];
+
+    const af::dim4 oStrides = out.strides();
+
+    OutT tImgMean = OutT(0);
+    dim_t winNumElements = tImg.elements();
+    bool needMean = MatchT==AF_ZSAD || MatchT==AF_LSAD ||
+        MatchT==AF_ZSSD || MatchT==AF_LSSD ||
+        MatchT==AF_ZNCC;
+    const InT * tpl = tImg.get();
+
+    if (needMean) {
+        for(dim_t tj=0; tj<tDim1; tj++) {
+            dim_t tjStride = tj*tStrides[1];
+
+            for(dim_t ti=0; ti<tDim0; ti++) {
+                tImgMean += (OutT)tpl[tjStride+ti*tStrides[0]];
+            }
+        }
+        tImgMean /= winNumElements;
+    }
+
+    OutT * const dstBase      = out.get();
+    const InT * const srcBase = sImg.get();
+    for(dim_t b3=0; b3<sDims[3]; ++b3) {
+        for(dim_t b2=0; b2<sDims[2]; ++b2) {
+            // Address each slice from (b2, b3) directly: bumping the pointers by strides[2] and then by strides[3] skipped a batch.
+            const InT * src = srcBase + b3*sStrides[3] + b2*sStrides[2];
+            OutT * dst      = dstBase + b3*oStrides[3] + b2*oStrides[2];
+            // slide through image window after window
+            for(dim_t sj=0; sj<sDim1; sj++) {
+
+                dim_t ojStride = sj*oStrides[1];
+
+                for(dim_t si=0; si<sDim0; si++) {
+                    OutT disparity = OutT(0);
+
+                    // mean for window
+                    // this variable will be used based on MatchT value
+                    OutT wImgMean = OutT(0);
+                    if (needMean) {
+                        for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
+                            dim_t jStride = j*sStrides[1];
+
+                            for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
+                                InT sVal = ((j<sDim1 && i<sDim0) ?
+                                        src[jStride + i*sStrides[0]] : InT(0));
+                                wImgMean += (OutT)sVal;
+                            }
+                        }
+                        wImgMean /= winNumElements;
+                    }
+
+                    // run the window match metric
+                    for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
+                        dim_t jStride = j*sStrides[1];
+                        dim_t tjStride = tj*tStrides[1];
+
+                        for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
+                            InT sVal = ((j<sDim1 && i<sDim0) ?
+                                    src[jStride + i*sStrides[0]] : InT(0));
+                            InT tVal = tpl[tjStride+ti*tStrides[0]];
+                            OutT temp;
+                            switch(MatchT) {
+                                case AF_SAD:
+                                    disparity += fabs((OutT)sVal-(OutT)tVal);
+                                    break;
+                                case AF_ZSAD:
+                                    disparity += fabs((OutT)sVal - wImgMean -
+                                            (OutT)tVal + tImgMean);
+                                    break;
+                                case AF_LSAD:
+                                    disparity += fabs((OutT)sVal-(wImgMean/tImgMean)*tVal);
+                                    break;
+                                case AF_SSD:
+                                    disparity += ((OutT)sVal-(OutT)tVal)*((OutT)sVal-(OutT)tVal);
+                                    break;
+                                case AF_ZSSD:
+                                    temp = ((OutT)sVal - wImgMean - (OutT)tVal + tImgMean);
+                                    disparity += temp*temp;
+                                    break;
+                                case AF_LSSD:
+                                    temp = ((OutT)sVal-(wImgMean/tImgMean)*tVal);
+                                    disparity += temp*temp;
+                                    break;
+                                case AF_NCC:
+                                    //TODO: future implementation (nothing is accumulated yet)
+                                    break;
+                                case AF_ZNCC:
+                                    //TODO: future implementation (nothing is accumulated yet)
+                                    break;
+                                case AF_SHD:
+                                    //TODO: future implementation (nothing is accumulated yet)
+                                    break;
+                            }
+                        }
+                    }
+                    // output is just created, hence not doing the
+                    // extra multiplication for 0th dim stride
+                    dst[ojStride + si] = disparity;
+                }
+            }
+        }
+    }
+}
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/meanshift.hpp b/src/backend/cpu/kernel/meanshift.hpp
new file mode 100644
index 0000000000..54fb1a89bf
--- /dev/null
+++ b/src/backend/cpu/kernel/meanshift.hpp
@@ -0,0 +1,135 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <vector>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool IsColor>
+void meanShift(Array<T> out, const Array<T> in, const float s_sigma,
+               const float c_sigma, const unsigned iter)
+{
+    // Mean-shift filter: for every pixel, up to `iter` times, averages the colours found in a
+    // (2*radius+1)^2 window that lie within c_sigma (squared distance <= c_sigma^2) of the current
+    // centre colour. IsColor: dim 2 holds the channels of one image; otherwise dim 2 is a batch.
+    const af::dim4 dims     = in.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+
+    const dim_t bCount   = (IsColor ? 1 : dims[2]);
+    const dim_t channels = (IsColor ? dims[2] : 1);
+
+    // clamp spatial and chromatic sigmas
+    float space_          = std::min(11.5f, s_sigma);
+    const dim_t radius = std::max((int)(space_ * 1.5f), 1);
+    const float cvar      = c_sigma*c_sigma;
+
+    std::vector<float> means(channels);
+    std::vector<float> centers(channels);
+    std::vector<float> tmpclrs(channels);
+
+    T * const outBase      = out.get();
+    const T * const inBase = in.get();
+
+    for(dim_t b3=0; b3<dims[3]; ++b3) {
+        for(dim_t b2=0; b2<bCount; ++b2) {
+            // Address the image from (b2, b3) directly. Advancing only by strides[2] per pass made
+            // colour batches (bCount == 1) start one channel further on instead of at strides[3].
+            T *outData       = outBase + b3*ostrides[3] + b2*ostrides[2];
+            const T * inData = inBase  + b3*istrides[3] + b2*istrides[2];
+
+            for(dim_t j=0; j<dims[1]; ++j) {
+                dim_t j_in_off  = j*istrides[1];
+                dim_t j_out_off = j*ostrides[1];
+
+                for(dim_t i=0; i<dims[0]; ++i) {
+                    dim_t i_in_off  = i*istrides[0];
+                    dim_t i_out_off = i*ostrides[0];
+
+                    // clear means and centers for this pixel
+                    for(dim_t ch=0; ch<channels; ++ch) {
+                        means[ch] = 0.0f;
+                        // the expression ch*istrides[2] only takes effect when ch>=1,
+                        // i.e for color images where batch is along fourth dimension
+                        centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]];
+                    }
+
+                    // scope of meanshift iterations begin
+                    // NOTE(review): means is zeroed once per pixel, not per iteration -- confirm intended.
+                    for(unsigned it=0; it<iter; ++it) {
+                        int count   = 0;
+                        int shift_x = 0;
+                        int shift_y = 0;
+
+                        for(dim_t wj=-radius; wj<=radius; ++wj) {
+                            int hit_count = 0;
+
+                            for(dim_t wi=-radius; wi<=radius; ++wi) {
+                                // the window is always taken around the pixel (i, j) itself
+                                dim_t tj = j + wj;
+                                dim_t ti = i + wi;
+
+                                // clamps offsets
+                                tj = clamp(tj, 0ll, dims[1]-1);
+                                ti = clamp(ti, 0ll, dims[0]-1);
+
+                                // proceed
+                                float norm = 0.0f;
+                                for(dim_t ch=0; ch<channels; ++ch) {
+                                    tmpclrs[ch] = inData[ tj*istrides[1] + ti*istrides[0] + ch*istrides[2]];
+                                    norm += (centers[ch]-tmpclrs[ch]) * (centers[ch]-tmpclrs[ch]);
+                                }
+
+                                if (norm<= cvar) {
+                                    for(dim_t ch=0; ch<channels; ++ch)
+                                        means[ch] += tmpclrs[ch];
+                                    shift_x += wi;
+                                    ++hit_count;
+                                }
+                            }
+                            count+= hit_count;
+                            shift_y += wj*hit_count;
+                        }
+
+                        if (count==0) { break; }
+
+                        const float fcount = 1.f/count;
+                        const int mean_x = (int)(shift_x*fcount+0.5f);
+                        const int mean_y = (int)(shift_y*fcount+0.5f);
+                        for(dim_t ch=0; ch<channels; ++ch)
+                            means[ch] *= fcount;
+
+                        float norm = 0.f;
+                        for(dim_t ch=0; ch<channels; ++ch)
+                            norm += ((means[ch]-centers[ch])*(means[ch]-centers[ch]));
+                        bool stop = ((abs(shift_y-mean_y)+abs(shift_x-mean_x)) + norm) <= 1;
+                        shift_x = mean_x;
+                        shift_y = mean_y;
+                        for(dim_t ch=0; ch<channels; ++ch)
+                            centers[ch] = means[ch];
+                        if (stop) { break; }
+                    } // scope of meanshift iterations end
+
+                    for(dim_t ch=0; ch<channels; ++ch)
+                        outData[j_out_off + i_out_off + ch*ostrides[2]] = centers[ch];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/medfilt.hpp b/src/backend/cpu/kernel/medfilt.hpp
new file mode 100644
index 0000000000..bc639a89b5
--- /dev/null
+++ b/src/backend/cpu/kernel/medfilt.hpp
@@ -0,0 +1,135 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <vector>
+#include <algorithm>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Median filter over dims 0 and 1 of every 2D slice of `in`, written to `out`.
+//
+// For each element, the w_len x w_wid window around it (w_len along dim 0,
+// w_wid along dim 1) is gathered and sorted. The middle value is stored, or
+// the mean of the two middle values when the window holds an even count.
+//
+// Window positions outside the slice contribute 0 under AF_PAD_ZERO. Under
+// AF_PAD_SYM the index is mirrored instead: a negative index is negated, an
+// index past the end is reflected about the last element, and the input is
+// read at the resulting position.
+//
+// Slices are visited by advancing both data pointers by their dim-2 stride
+// after every (b3, b2) iteration; strides[3] is never consulted.
+template<typename T, af_border_type Pad>
+void medfilt(Array<T> out, const Array<T> in, dim_t w_len, dim_t w_wid)
+{
+    const af::dim4 dims     = in.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+
+    const int nRows    = (int)dims[0];
+    const int nCols    = (int)dims[1];
+    const int winRows  = (int)w_len;
+    const int winCols  = (int)w_wid;
+    const int halfRows = (int)(w_len/2);
+    const int halfCols = (int)(w_wid/2);
+
+    // Scratch buffer, cleared and refilled for every output element.
+    std::vector<T> window;
+    window.reserve(w_len*w_wid);
+
+    const T* src = in.get();
+    T*       dst = out.get();
+
+    for (int b3 = 0; b3 < (int)dims[3]; b3++) {
+        for (int b2 = 0; b2 < (int)dims[2]; b2++) {
+            for (int col = 0; col < nCols; col++) {
+
+                const int outColOff = col*ostrides[1];
+
+                for (int row = 0; row < nRows; row++) {
+
+                    window.clear();
+
+                    for (int wj = 0; wj < winCols; ++wj) {
+
+                        // Resolve the source column for this window column.
+                        // colIsPad is only ever raised by zero padding.
+                        int  srcCol    = col + wj - halfCols;
+                        int  srcColOff = 0;
+                        bool colIsPad  = false;
+
+                        if (Pad == AF_PAD_ZERO) {
+                            srcColOff = srcCol * istrides[1];
+                            colIsPad  = (srcCol < 0 || srcCol >= nCols);
+                        } else if (Pad == AF_PAD_SYM) {
+                            if (srcCol < 0) {
+                                srcCol = -srcCol;
+                            }
+                            if (srcCol >= nCols) {
+                                srcCol = 2*(nCols-1) - srcCol;
+                            }
+                            srcColOff = srcCol * istrides[1];
+                        }
+
+                        for (int wi = 0; wi < winRows; ++wi) {
+
+                            // Same resolution along dim 0.
+                            int  srcRow    = row + wi - halfRows;
+                            int  srcRowOff = 0;
+                            bool rowIsPad  = false;
+
+                            if (Pad == AF_PAD_ZERO) {
+                                srcRowOff = srcRow * istrides[0];
+                                rowIsPad  = (srcRow < 0 || srcRow >= nRows);
+                            } else if (Pad == AF_PAD_SYM) {
+                                if (srcRow < 0) {
+                                    srcRow = -srcRow;
+                                }
+                                if (srcRow >= nRows) {
+                                    srcRow = 2*(nRows-1) - srcRow;
+                                }
+                                srcRowOff = srcRow * istrides[0];
+                            }
+
+                            // Zero padding substitutes a 0; mirrored and
+                            // in-range positions both read the input.
+                            if (rowIsPad || colIsPad)
+                                window.push_back(0);
+                            else
+                                window.push_back(src[srcColOff+srcRowOff]);
+                        }
+                    }
+
+                    // Sort the gathered values and pick the median.
+                    std::stable_sort(window.begin(), window.end());
+                    const int mid = window.size()/2;
+                    T& result = dst[outColOff + row*ostrides[0]];
+                    if (window.size()%2 == 0)
+                        result = (window[mid] + window[mid-1])/2;
+                    else
+                        result = window[mid];
+                }
+            }
+
+            // Step both pointers to the next 2D slice.
+            src += istrides[2];
+            dst += ostrides[2];
+        }
+    }
+}
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/morph.hpp b/src/backend/cpu/kernel/morph.hpp
new file mode 100644
index 0000000000..af9b7e9373
--- /dev/null
+++ b/src/backend/cpu/kernel/morph.hpp
@@ -0,0 +1,140 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// 2D morphological filter over dims 0 and 1 of every slice of `in`:
+// dilation (max) when IsDilation is true, erosion (min) otherwise. Each
+// output starts from the centre input value and is combined with every
+// neighbour that lies inside the slice and whose mask entry is > 0.
+template<typename T, bool IsDilation>
+void morph(Array<T> out, Array<T> const in, Array<T> const mask)
+{
+    const af::dim4 dims     = in.dims();
+    const af::dim4 window   = mask.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+    const af::dim4 fstrides = mask.strides();
+
+    // Offsets that centre the window on the current element.
+    const dim_t R0 = window[0]/2;
+    const dim_t R1 = window[1]/2;
+
+    const T* src    = in.get();
+    const T* filter = mask.get();
+    T*       dst    = out.get();
+
+    for (dim_t b3 = 0; b3 < dims[3]; ++b3) {
+        for (dim_t b2 = 0; b2 < dims[2]; ++b2) {
+            for (dim_t j = 0; j < dims[1]; ++j) {        // along dim 1
+                for (dim_t i = 0; i < dims[0]; ++i) {    // along dim 0
+                    T acc = src[ getIdx(istrides, i, j) ];
+
+                    // wj / wi walk dims 1 / 0 of the mask window.
+                    for (dim_t wj = 0; wj < window[1]; wj++) {
+                        for (dim_t wi = 0; wi < window[0]; wi++) {
+                            const dim_t nj = j + wj - R1;
+                            const dim_t ni = i + wi - R0;
+
+                            const T maskValue = filter[ getIdx(fstrides, wi, wj) ];
+
+                            // Skip masked-out taps and taps outside the slice.
+                            if (!(maskValue > (T)0)) continue;
+                            if (ni < 0 || nj < 0 || ni >= dims[0] || nj >= dims[1]) continue;
+
+                            const T neighbour = src[ getIdx(istrides, ni, nj) ];
+                            acc = IsDilation ? std::max(acc, neighbour)
+                                             : std::min(acc, neighbour);
+                        }
+                    }
+
+                    dst[ getIdx(ostrides, i, j) ] = acc;
+                }
+            }
+
+            // The next (b3, b2) iteration handles the next 2D slice.
+            dst += ostrides[2];
+            src += istrides[2];
+        }
+    }
+}
+
+// 3D counterpart of morph: filters over dims 0, 1 and 2 of every volume of
+// `in` (one volume per index along dim 3), taking the max (IsDilation) or min
+// of the centre value and every in-bounds neighbour whose mask entry is > 0.
+template<typename T, bool IsDilation>
+void morph3d(Array<T> out, Array<T> const in, Array<T> const mask)
+{
+    const af::dim4 dims     = in.dims();
+    const af::dim4 window   = mask.dims();
+    const af::dim4 istrides = in.strides();
+    const af::dim4 ostrides = out.strides();
+    const af::dim4 fstrides = mask.strides();
+
+    // Offsets that centre the window on the current element.
+    const dim_t R0 = window[0]/2;
+    const dim_t R1 = window[1]/2;
+    const dim_t R2 = window[2]/2;
+
+    const dim_t bCount = dims[3];
+    const T* src    = in.get();
+    const T* filter = mask.get();
+    T*       dst    = out.get();
+
+    for (dim_t batchId = 0; batchId < bCount; ++batchId) {
+        for (dim_t k = 0; k < dims[2]; ++k) {                // along dim 2
+            for (dim_t j = 0; j < dims[1]; ++j) {            // along dim 1
+                for (dim_t i = 0; i < dims[0]; ++i) {        // along dim 0
+                    T acc = src[ getIdx(istrides, i, j, k) ];
+
+                    // wk / wj / wi walk dims 2 / 1 / 0 of the mask window.
+                    for (dim_t wk = 0; wk < window[2]; wk++) {
+                        for (dim_t wj = 0; wj < window[1]; wj++) {
+                            for (dim_t wi = 0; wi < window[0]; wi++) {
+                                const dim_t nk = k + wk - R2;
+                                const dim_t nj = j + wj - R1;
+                                const dim_t ni = i + wi - R0;
+
+                                const T maskValue = filter[ getIdx(fstrides, wi, wj, wk) ];
+
+                                // Skip masked-out taps and taps that fall
+                                // outside the volume.
+                                if (!(maskValue > (T)0)) continue;
+                                if (ni < 0 || nj < 0 || nk < 0) continue;
+                                if (ni >= dims[0] || nj >= dims[1] || nk >= dims[2]) continue;
+
+                                const T neighbour = src[ getIdx(istrides, ni, nj, nk) ];
+                                acc = IsDilation ? std::max(acc, neighbour)
+                                                 : std::min(acc, neighbour);
+                            } // window dim 0
+                        } // window dim 1
+                    } // window dim 2
+
+                    dst[ getIdx(ostrides, i, j, k) ] = acc;
+                }
+            }
+        }
+
+        // The next iteration handles the next volume along dim 3.
+        dst += ostrides[3];
+        src += istrides[3];
+    }
+}
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/nearest_neighbour.hpp b/src/backend/cpu/kernel/nearest_neighbour.hpp
new file mode 100644
index 0000000000..4916463aed
--- /dev/null
+++ b/src/backend/cpu/kernel/nearest_neighbour.hpp
@@ -0,0 +1,143 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+// On Windows builds __builtin_popcount is mapped onto the __popcnt intrinsic.
+// <intrin.h> is included at global scope, before the namespaces open, so its
+// declarations are not nested inside cpu::kernel.
+#if defined(_WIN32) || defined(_MSC_VER)
+#include <intrin.h>
+#define __builtin_popcount __popcnt
+#endif
+namespace cpu
+{
+namespace kernel
+{
+
+// Per-element distance functor used by nearest_neighbour; the metric is
+// chosen by dist_type and the result is converted to To.
+//
+// Primary template: fallback for combinations without a specialization. It
+// returns the plain difference, which is not a meaningful distance.
+template<typename T, typename To, af_match_type dist_type>
+struct dist_op
+{
+    To operator()(T v1, T v2) { return v1 - v2; }
+};
+
+// AF_SAD: absolute difference, computed after converting both operands to
+// double.
+template<typename T, typename To>
+struct dist_op<T, To, AF_SAD>
+{
+    To operator()(T v1, T v2) { return std::abs((double)v1 - (double)v2); }
+};
+
+// AF_SSD: squared difference, computed in the arithmetic of T itself.
+template<typename T, typename To>
+struct dist_op<T, To, AF_SSD>
+{
+    To operator()(T v1, T v2) { return (v1 - v2) * (v1 - v2); }
+};
+
+// AF_SHD (Hamming distance): number of bits that differ between the two
+// operands. Specialized per unsigned integer type.
+template<typename To>
+struct dist_op<uint, To, AF_SHD>
+{
+    To operator()(uint v1, uint v2) { return __builtin_popcount(v1 ^ v2); }
+};
+
+// AF_SHD for uintl (ArrayFire's 64-bit unsigned type). __builtin_popcount
+// (and the __popcnt it maps to on Windows) takes a 32-bit argument, so
+// passing the 64-bit XOR directly would silently drop the upper half. The two
+// 32-bit halves are counted separately instead.
+template<typename To>
+struct dist_op<uintl, To, AF_SHD>
+{
+    To operator()(uintl v1, uintl v2)
+    {
+        const uintl diff = v1 ^ v2;
+        return __builtin_popcount((unsigned)diff) +
+               __builtin_popcount((unsigned)(diff >> 32));
+    }
+};
+
+// AF_SHD for uchar operands.
+template<typename To>
+struct dist_op<uchar, To, AF_SHD>
+{
+    To operator()(uchar v1, uchar v2) { return __builtin_popcount(v1 ^ v2); }
+};
+
+// AF_SHD for ushort operands.
+template<typename To>
+struct dist_op<ushort, To, AF_SHD>
+{
+    To operator()(ushort v1, ushort v2) { return __builtin_popcount(v1 ^ v2); }
+};
+
+// Brute-force nearest-neighbour search. For every query sample, all training
+// samples are scanned, dist_op is accumulated over the dist_dim axis, and the
+// index and distance of the closest training sample are written to idx/dist
+// (the earliest sample wins ties).
+//
+// Elements are addressed with dims[0] as the stride between columns; n_dist
+// is not read by this implementation.
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour(Array<uint> idx, Array<To> dist,
+                       const Array<T> query, const Array<T> train,
+                       const uint dist_dim, const uint n_dist)
+{
+    const uint sample_dim = (dist_dim == 0) ? 1 : 0;
+    const dim4 qDims = query.dims();
+    const dim4 tDims = train.dims();
+
+    const unsigned featLen = qDims[dist_dim];
+    const unsigned nQuery  = qDims[sample_dim];
+    const unsigned nTrain  = tDims[sample_dim];
+
+    const T* qPtr = query.get();
+    const T* tPtr = train.get();
+    uint*    iPtr = idx.get();
+    To*      dPtr = dist.get();
+
+    dist_op<T, To, dist_type> op;
+    const bool samplesAlongDim0 = (sample_dim == 0);
+
+    for (unsigned q = 0; q < nQuery; q++) {
+        To       best_dist = limit_max<To>();
+        unsigned best_idx  = 0;
+
+        for (unsigned t = 0; t < nTrain; t++) {
+            To local_dist = 0;
+            for (unsigned k = 0; k < featLen; k++) {
+                const size_t qIdx = samplesAlongDim0 ? k * qDims[0] + q
+                                                     : q * qDims[0] + k;
+                const size_t tIdx = samplesAlongDim0 ? k * tDims[0] + t
+                                                     : t * tDims[0] + k;
+                local_dist += op(qPtr[qIdx], tPtr[tIdx]);
+            }
+
+            if (local_dist < best_dist) {
+                best_dist = local_dist;
+                best_idx  = t;
+            }
+        }
+
+        iPtr[q] = best_idx;
+        dPtr[q] = best_dist;
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp
new file mode 100644
index 0000000000..acd508cb70
--- /dev/null
+++ b/src/backend/cpu/kernel/orb.hpp
@@ -0,0 +1,509 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Reference pattern, generated for a patch size of 31x31, as suggested by
+// original ORB paper
+#define REF_PAT_SIZE 31
+#define REF_PAT_SAMPLES 256
+#define REF_PAT_COORDS 4
+#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS)
+
+// The current reference pattern was borrowed from OpenCV. To build a pattern
+// of similar quality, a training process must be applied, as described in
+// sections 4.2 and 4.3 of the original ORB paper.
+const int ref_pat[REF_PAT_LENGTH] = {
+    8,-3, 9,5,
+    4,2, 7,-12,
+    -11,9, -8,2,
+    7,-12, 12,-13,
+    2,-13, 2,12,
+    1,-7, 1,6,
+    -2,-10, -2,-4,
+    -13,-13, -11,-8,
+    -13,-3, -12,-9,
+    10,4, 11,9,
+    -13,-8, -8,-9,
+    -11,7, -9,12,
+    7,7, 12,6,
+    -4,-5, -3,0,
+    -13,2, -12,-3,
+    -9,0, -7,5,
+    12,-6, 12,-1,
+    -3,6, -2,12,
+    -6,-13, -4,-8,
+    11,-13, 12,-8,
+    4,7, 5,1,
+    5,-3, 10,-3,
+    3,-7, 6,12,
+    -8,-7, -6,-2,
+    -2,11, -1,-10,
+    -13,12, -8,10,
+    -7,3, -5,-3,
+    -4,2, -3,7,
+    -10,-12, -6,11,
+    5,-12, 6,-7,
+    5,-6, 7,-1,
+    1,0, 4,-5,
+    9,11, 11,-13,
+    4,7, 4,12,
+    2,-1, 4,4,
+    -4,-12, -2,7,
+    -8,-5, -7,-10,
+    4,11, 9,12,
+    0,-8, 1,-13,
+    -13,-2, -8,2,
+    -3,-2, -2,3,
+    -6,9, -4,-9,
+    8,12, 10,7,
+    0,9, 1,3,
+    7,-5, 11,-10,
+    -13,-6, -11,0,
+    10,7, 12,1,
+    -6,-3, -6,12,
+    10,-9, 12,-4,
+    -13,8, -8,-12,
+    -13,0, -8,-4,
+    3,3, 7,8,
+    5,7, 10,-7,
+    -1,7, 1,-12,
+    3,-10, 5,6,
+    2,-4, 3,-10,
+    -13,0, -13,5,
+    -13,-7, -12,12,
+    -13,3, -11,8,
+    -7,12, -4,7,
+    6,-10, 12,8,
+    -9,-1, -7,-6,
+    -2,-5, 0,12,
+    -12,5, -7,5,
+    3,-10, 8,-13,
+    -7,-7, -4,5,
+    -3,-2, -1,-7,
+    2,9, 5,-11,
+    -11,-13, -5,-13,
+    -1,6, 0,-1,
+    5,-3, 5,2,
+    -4,-13, -4,12,
+    -9,-6, -9,6,
+    -12,-10, -8,-4,
+    10,2, 12,-3,
+    7,12, 12,12,
+    -7,-13, -6,5,
+    -4,9, -3,4,
+    7,-1, 12,2,
+    -7,6, -5,1,
+    -13,11, -12,5,
+    -3,7, -2,-6,
+    7,-8, 12,-7,
+    -13,-7, -11,-12,
+    1,-3, 12,12,
+    2,-6, 3,0,
+    -4,3, -2,-13,
+    -1,-13, 1,9,
+    7,1, 8,-6,
+    1,-1, 3,12,
+    9,1, 12,6,
+    -1,-9, -1,3,
+    -13,-13, -10,5,
+    7,7, 10,12,
+    12,-5, 12,9,
+    6,3, 7,11,
+    5,-13, 6,10,
+    2,-12, 2,3,
+    3,8, 4,-6,
+    2,6, 12,-13,
+    9,-12, 10,3,
+    -8,4, -7,9,
+    -11,12, -4,-6,
+    1,12, 2,-8,
+    6,-9, 7,-4,
+    2,3, 3,-2,
+    6,3, 11,0,
+    3,-3, 8,-8,
+    7,8, 9,3,
+    -11,-5, -6,-4,
+    -10,11, -5,10,
+    -5,-8, -3,12,
+    -10,5, -9,0,
+    8,-1, 12,-6,
+    4,-6, 6,-11,
+    -10,12, -8,7,
+    4,-2, 6,7,
+    -2,0, -2,12,
+    -5,-8, -5,2,
+    7,-6, 10,12,
+    -9,-13, -8,-8,
+    -5,-13, -5,-2,
+    8,-8, 9,-13,
+    -9,-11, -9,0,
+    1,-8, 1,-2,
+    7,-4, 9,1,
+    -2,1, -1,-4,
+    11,-6, 12,-11,
+    -12,-9, -6,4,
+    3,7, 7,12,
+    5,5, 10,8,
+    0,-4, 2,8,
+    -9,12, -5,-13,
+    0,7, 2,12,
+    -1,2, 1,7,
+    5,11, 7,-9,
+    3,5, 6,-8,
+    -13,-4, -8,9,
+    -5,9, -3,-3,
+    -4,-7, -3,-12,
+    6,5, 8,0,
+    -7,6, -6,12,
+    -13,6, -5,-2,
+    1,-10, 3,10,
+    4,1, 8,-4,
+    -2,-2, 2,-13,
+    2,-12, 12,12,
+    -2,-13, 0,-6,
+    4,1, 9,3,
+    -6,-10, -3,-5,
+    -3,-13, -1,1,
+    7,5, 12,-11,
+    4,-2, 5,-7,
+    -13,9, -9,-5,
+    7,1, 8,6,
+    7,-8, 7,6,
+    -7,-4, -7,1,
+    -8,11, -7,-8,
+    -13,6, -12,-8,
+    2,4, 3,9,
+    10,-5, 12,3,
+    -6,-5, -6,7,
+    8,-3, 9,-8,
+    2,-12, 2,8,
+    -11,-2, -10,3,
+    -12,-13, -7,-9,
+    -11,0, -10,-5,
+    5,-3, 11,8,
+    -2,-13, -1,12,
+    -1,-8, 0,9,
+    -13,-11, -12,-5,
+    -10,-2, -10,11,
+    -3,9, -2,-13,
+    2,-3, 3,2,
+    -9,-13, -4,0,
+    -4,6, -3,-10,
+    -4,12, -2,-7,
+    -6,-11, -4,9,
+    6,-3, 6,11,
+    -13,11, -5,5,
+    11,11, 12,6,
+    7,-5, 12,-2,
+    -1,12, 0,7,
+    -4,-8, -3,-2,
+    -7,1, -6,7,
+    -13,-12, -8,-13,
+    -7,-2, -6,-8,
+    -8,5, -6,-9,
+    -5,-1, -4,5,
+    -13,7, -8,10,
+    1,5, 5,-13,
+    1,0, 10,-13,
+    9,12, 10,-1,
+    5,-8, 10,-9,
+    -1,11, 1,-13,
+    -9,-3, -6,2,
+    -1,-10, 1,12,
+    -13,1, -8,-10,
+    8,-11, 10,-6,
+    2,-13, 3,-6,
+    7,-13, 12,-9,
+    -10,-10, -5,-7,
+    -10,-8, -8,-13,
+    4,-6, 8,5,
+    3,12, 8,-13,
+    -4,2, -3,-3,
+    5,-13, 10,-12,
+    4,-13, 5,-1,
+    -9,9, -4,3,
+    0,3, 3,-9,
+    -12,1, -6,1,
+    3,2, 4,-8,
+    -10,-10, -10,9,
+    8,-13, 12,12,
+    -8,-12, -6,-5,
+    2,2, 3,7,
+    10,6, 11,-8,
+    6,8, 8,-12,
+    -7,10, -6,5,
+    -3,-9, -3,9,
+    -1,-13, -1,5,
+    -3,-7, -3,4,
+    -8,-2, -8,3,
+    4,2, 12,12,
+    2,-5, 3,11,
+    6,-9, 11,-13,
+    3,-1, 7,12,
+    11,-1, 12,4,
+    -3,0, -3,6,
+    4,-11, 4,12,
+    2,-4, 2,1,
+    -10,-6, -8,1,
+    -13,7, -11,1,
+    -13,12, -11,-13,
+    6,0, 11,-13,
+    0,-1, 1,4,
+    -13,3, -9,-2,
+    -9,8, -6,-3,
+    -13,-6, -8,-2,
+    5,-9, 8,10,
+    2,7, 3,-9,
+    -1,-6, -1,-1,
+    9,5, 11,-2,
+    11,-3, 12,-8,
+    3,0, 3,5,
+    -1,4, 0,10,
+    3,-6, 4,5,
+    -13,0, -10,5,
+    5,8, 12,11,
+    8,9, 9,-6,
+    7,-4, 8,-12,
+    -10,4, -10,9,
+    7,3, 12,4,
+    9,-7, 10,-2,
+    7,0, 12,-2,
+    -1,-6, 0,-11,
+};
+
+// Copies the first n_feat features into the output arrays. x, y and size are
+// gathered through score_idx, while score_in is read sequentially
+// (presumably it is already ordered to match score_idx -- confirm against
+// the caller). Sizes are copied only when both size pointers are non-null.
+template<typename T>
+void keep_features(
+    float* x_out, float* y_out, float* score_out, float* size_out,
+    const float* x_in, const float* y_in, const float* score_in,
+    const unsigned* score_idx, const float* size_in, const unsigned n_feat)
+{
+    const bool copy_sizes = (size_in != nullptr && size_out != nullptr);
+
+    // Keep only the first n_feat features
+    for (unsigned f = 0; f < n_feat; f++) {
+        const unsigned src = score_idx[f];
+
+        x_out[f]     = x_in[src];
+        y_out[f]     = y_in[src];
+        score_out[f] = score_in[f];
+        if (copy_sizes) size_out[f] = size_in[src];
+    }
+}
+
+// Scores candidate features with the Harris corner response and compacts the
+// ones that are kept into the output arrays.
+//
+//  - x_in / y_in (and scl_in when use_scl is set) describe total_feat
+//    candidates; with use_scl the coordinates are multiplied by the scale.
+//  - A candidate is dropped when a patch of the (scaled, odd-rounded) size,
+//    widened by sqrt(2) for rotation, would not fit inside the image.
+//  - *usable_feat is the output cursor: its current value is the slot
+//    written, and it is incremented once per kept feature.
+//  - The stored score is (det - k_thr * trace^2) of the gradient matrix
+//    summed over a block_size x block_size window, times 0.001f^4.
+//  - size_out is written only when use_scl is set.
+//
+// x indexes dim 1 and y indexes dim 0; pixels are read at x * dims[0] + y.
+template<typename T, bool use_scl>
+void harris_response(
+    float* x_out, float* y_out, float* score_out, float* size_out,
+    const float* x_in, const float* y_in, const float* scl_in,
+    const unsigned total_feat, unsigned* usable_feat,
+    const Array<T>& image,
+    const unsigned block_size, const float k_thr, const unsigned patch_size)
+{
+    const af::dim4 idims = image.dims();
+    const T* image_ptr   = image.get();
+
+    const unsigned r             = block_size / 2;
+    const unsigned block_size_sq = block_size * block_size;
+
+    // Scale factor
+    // TODO: improve response scaling
+    float rscale = 0.001f;
+    rscale = rscale * rscale * rscale * rscale;
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        // Update x and y coordinates according to scale
+        const float scl  = use_scl ? scl_in[f] : 1.f;
+        const unsigned x = (unsigned)round(use_scl ? x_in[f] * scl : x_in[f]);
+        const unsigned y = (unsigned)round(use_scl ? y_in[f] * scl : y_in[f]);
+
+        // Round feature size to nearest odd integer
+        const float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
+
+        // Avoid keeping features that might be too wide and might not fit on
+        // the image, sqrt(2.f) is the radius when angle is 45 degrees and
+        // represents widest case possible
+        const unsigned patch_r = ceil(size * sqrt(2.f) / 2.f);
+        const bool fits = x >= patch_r && y >= patch_r &&
+                          x < idims[1] - patch_r && y < idims[0] - patch_r;
+        if (!fits) continue;
+
+        float ixx = 0.f, iyy = 0.f, ixy = 0.f;
+        for (unsigned k = 0; k < block_size_sq; k++) {
+            const int i = k / block_size - r;
+            const int j = k % block_size - r;
+
+            // Calculate local x and y derivatives
+            const float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j];
+            const float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1];
+
+            // Accumulate second order derivatives
+            ixx += ix*ix;
+            iyy += iy*iy;
+            ixy += ix*iy;
+        }
+
+        // Calculate Harris responses
+        const float tr   = ixx + iyy;
+        const float det  = ixx*iyy - ixy*ixy;
+        const float resp = det - k_thr * (tr*tr);
+
+        // Claim the next free output slot
+        const unsigned idx = (*usable_feat)++;
+
+        x_out[idx]     = x;
+        y_out[idx]     = y;
+        score_out[idx] = resp * rscale;
+        if (use_scl)
+            size_out[idx] = size;
+    }
+}
+
+// Computes the intensity-centroid orientation of each feature: first order
+// moments m10 (dim-1 offset weighted) and m01 (dim-0 offset weighted) are
+// summed over a patch_size x patch_size patch and atan2(m01, m10) is stored.
+// Features whose patch does not fit are skipped (orientation_out untouched).
+template<typename T>
+void centroid_angle(
+    const float* x_in, const float* y_in, float* orientation_out,
+    const unsigned total_feat, const Array<T>& image, const unsigned patch_size)
+{
+    const af::dim4 idims = image.dims();
+    const T* image_ptr = image.get();
+    const unsigned r = patch_size / 2;
+    const unsigned patch_size_sq = patch_size * patch_size;
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        unsigned x = (unsigned)round(x_in[f]);
+        unsigned y = (unsigned)round(y_in[f]);
+
+        // Patch offsets reach +r for an odd patch_size, so the upper bounds
+        // must be exclusive (>=) or the last column/row is overrun by one.
+        if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r)
+            continue;
+
+        T m01 = (T)0, m10 = (T)0;
+        for (unsigned k = 0; k < patch_size_sq; k++) {
+            int i = k / patch_size - r;
+            int j = k % patch_size - r;
+            // Calculate first order moments
+            T p = image_ptr[(x+i) * idims[0] + y+j];
+            m01 += j * p;
+            m10 += i * p;
+        }
+        orientation_out[f] = atan2(m01, m10);
+    }
+}
+
+// Returns the image value at (x, y) displaced by the pattern offset
+// (dist_x, dist_y), after scaling the offset by size / patch_size and
+// rotating it by `ori` radians. No bounds check is performed here.
+template<typename T>
+inline T get_pixel(
+    unsigned x, unsigned y,
+    const float ori, const unsigned size,
+    const int dist_x, const int dist_y,
+    const Array<T>& image, const unsigned patch_size)
+{
+    const af::dim4 idims = image.dims();
+    const T* image_ptr   = image.get();
+
+    const float ori_sin   = sin(ori);
+    const float ori_cos   = cos(ori);
+    const float patch_scl = (float)size / (float)patch_size;
+
+    // Calculate point coordinates based on orientation and size
+    x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
+    y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
+
+    return image_ptr[x * idims[0] + y];
+}
+
+// Builds the 256-bit ORB descriptor of every feature.
+//  - Each descriptor is 8 x 32-bit words; bit j of word i is set when the
+//    first sample point of pattern entry (i*32 + j) is smaller than the second.
+//  - Sample points come from ref_pat, rotated by the feature orientation.
+//  - x_in_out / y_in_out are overwritten with round(position * scl) and
+//    size_out with patch_size * scl.
+//  - Features too close to the border are skipped; none of their outputs
+//    are written.
+template<typename T>
+void extract_orb(
+    unsigned* desc_out, const unsigned n_feat,
+    float* x_in_out, float* y_in_out, const float* ori_in, float* size_out,
+    const Array<T>& image, const float scl, const unsigned patch_size)
+{
+    const af::dim4 idims = image.dims();
+    for (unsigned f = 0; f < n_feat; f++) {
+        unsigned x = (unsigned)round(x_in_out[f]);
+        unsigned y = (unsigned)round(y_in_out[f]);
+        float ori = ori_in[f];
+        unsigned size = patch_size;
+
+        unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
+        if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r)
+            continue;
+
+        // Descriptor fixed at 256 bits for now
+        // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
+        for (unsigned i = 0; i < 8; i++) {
+            unsigned v = 0;
+
+            // j < 32 for 256 bits descriptor
+            for (unsigned j = 0; j < 32; j++) {
+                // Each pattern entry holds (x1, y1, x2, y2) for one comparison
+                const int* pat = ref_pat + i*32*4 + j*4;
+                T p1 = get_pixel(x, y, ori, size, pat[0], pat[1], image, patch_size);
+                T p2 = get_pixel(x, y, ori, size, pat[2], pat[3], image, patch_size);
+
+                // Shift as unsigned: (p1 < p2) promotes to int, and 1 << 31
+                // overflows a 32-bit signed int.
+                v |= (unsigned)(p1 < p2) << j;
+            }
+
+            // Accumulate the 32 bits into the descriptor word (the caller is
+            // presumably expected to zero desc_out beforehand -- confirm).
+            desc_out[f * 8 + i] += v;
+        }
+
+        x_in_out[f] = round(x * scl);
+        y_in_out[f] = round(y * scl);
+        size_out[f] = patch_size * scl;
+    }
+}
+
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/random.hpp b/src/backend/cpu/kernel/random.hpp
new file mode 100644
index 0000000000..9c59a64db9
--- /dev/null
+++ b/src/backend/cpu/kernel/random.hpp
@@ -0,0 +1,200 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <type_traits>
+#include <random>
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <type_traits>
+
+namespace cpu
+{
+namespace kernel
+{
+
+using namespace std;
+
+#if defined(_WIN32)
+    #define __THREAD_LOCAL static __declspec(thread)
+#else
+    #define __THREAD_LOCAL static __thread
+#endif
+// SFINAE helpers: each alias names function<T()> only when T satisfies the trait.
+template<typename T>
+using is_arithmetic_t       = typename enable_if< is_arithmetic<T>::value,      function<T()>>::type;
+template<typename T>
+using is_complex_t          = typename enable_if< is_complex<T>::value,         function<T()>>::type;
+template<typename T>
+using is_floating_point_t   = typename enable_if< is_floating_point<T>::value,  function<T()>>::type;
+
+// Uniform generator for arithmetic T: real distribution for floating-point
+// types, integer distribution otherwise. bind() stores a copy of `generator`.
+template<typename T, typename GenType>
+is_arithmetic_t<T> urand(GenType &generator)
+{
+    typedef typename conditional<is_floating_point<T>::value, uniform_real_distribution<T>,
+#if OS_WIN
+        uniform_int_distribution<unsigned>>::type distribution_t;
+#else
+        uniform_int_distribution<T >> ::type distribution_t;
+#endif
+    return bind(distribution_t(), generator);
+}
+
+// Complex overload: real and imaginary parts both come from the scalar urand.
+template<typename T, typename GenType>
+is_complex_t<T> urand(GenType &generator)
+{
+    auto component = urand<typename T::value_type>(generator);
+    return [component] () { return T(component(), component()); };
+}
+
+// Standard-normal generator for floating-point T (bind() copies `generator`).
+template<typename T, typename GenType>
+is_floating_point_t<T> nrand(GenType &generator)
+{
+    return bind(normal_distribution<T>(), generator);
+}
+
+// Complex overload: real and imaginary parts both come from the scalar nrand.
+template<typename T, typename GenType>
+is_complex_t<T> nrand(GenType &generator)
+{
+    auto component = nrand<typename T::value_type>(generator);
+    return [component] () { return T(component(), component()); };
+}
+// Defined `inline`: non-template functions in a header need it to satisfy the ODR.
+// Per-thread mt19937 engine, heap-allocated on first use and never freed.
+// FIXME: pointer works around missing thread_local objects in Xcode / MSVC.
+inline mt19937& getGenerator()
+{
+    __THREAD_LOCAL mt19937 *generator = NULL;
+    if (generator == NULL) generator = new mt19937();
+    return *generator;
+}
+
+// Per-thread record of the last seed passed to setSeed (0 until then).
+inline unsigned long long& getSeed()
+{
+    __THREAD_LOCAL unsigned long long gen_seed = 0;
+    return gen_seed;
+}
+
+// Copies the calling thread's current seed into *seed.
+inline void getSeedPtr(unsigned long long *seed) { *seed = getSeed(); }
+
+// Per-thread flag that stays true until setSeed has run on this thread.
+inline bool& isFirst()
+{
+    __THREAD_LOCAL bool is_first = true;
+    return is_first;
+}
+
+// Reseeds this thread's engine, records the seed and clears the first-use flag.
+inline void setSeed(const uintl seed)
+{
+    getGenerator().seed(seed);
+    getSeed() = seed;
+    isFirst() = false;
+}
+
+//FIXME: See if we can use functors instead of function pointer directly
+// Holds the type-erased callable that produces one random value per call.
+template<typename T>
+struct RandomDistribution
+{
+    std::function<T()> func;
+
+    RandomDistribution(std::function<T()> dist_func) : func(dist_func) {}
+};
+
+// Fills `out` with values produced by nrand<T> (normal_distribution). The
+// bound distribution is cached per thread and rebuilt when the seed changes.
+template<typename T>
+void randn(Array<T> out)
+{
+    __THREAD_LOCAL unsigned long long cachedSeed = 0;
+    if (isFirst()) {
+        cachedSeed = getSeed();
+        setSeed(cachedSeed);
+    }
+
+    // FIXME: the raw pointer is a workaround for incomplete thread_local
+    // support in Xcode and Visual Studio; ideally a thread_local object.
+    __THREAD_LOCAL RandomDistribution<T> *cached = NULL;
+
+    if (cached == NULL || cachedSeed != getSeed()) {
+        delete cached;
+        cached     = new RandomDistribution<T>(nrand<T>(getGenerator()));
+        cachedSeed = getSeed();
+    }
+
+    T *dst = out.get();
+    const int count = (int)out.elements();
+    for (int i = 0; i < count; i++) dst[i] = cached->func();
+}
+
+// Fills `out` with values produced by urand<T> (uniform distribution). The
+// bound distribution is cached per thread and rebuilt when the seed changes.
+template<typename T>
+void randu(Array<T> out)
+{
+    __THREAD_LOCAL unsigned long long cachedSeed = 0;
+    if (isFirst()) {
+        cachedSeed = getSeed();
+        setSeed(cachedSeed);
+    }
+
+    // FIXME: the raw pointer is a workaround for incomplete thread_local
+    // support in Xcode and Visual Studio; ideally a thread_local object.
+    __THREAD_LOCAL RandomDistribution<T> *cached = NULL;
+
+    if (cached == NULL || cachedSeed != getSeed()) {
+        delete cached;
+        cached     = new RandomDistribution<T>(urand<T>(getGenerator()));
+        cachedSeed = getSeed();
+    }
+
+    T *dst = out.get();
+    const int count = (int)out.elements();
+    for (int i = 0; i < count; i++) dst[i] = cached->func();
+}
+
+// char specialization: each element is 1 when a float drawn uniformly from
+// [0, 1) exceeds 0.5 and 0 otherwise. The draw comes from urand<float>; a
+// normal distribution here would make 1s far rarer than 0s. `inline` because
+// a full specialization defined in a header is an ordinary function.
+template<>
+inline void randu(Array<char> out)
+{
+    __THREAD_LOCAL unsigned long long my_seed = 0;
+    if (isFirst()) {
+        my_seed = getSeed();
+        setSeed(my_seed);
+    }
+
+    // FIXME: pointer works around missing thread_local objects (Xcode, MSVC)
+    __THREAD_LOCAL RandomDistribution<float> *distPtr = NULL;
+
+    if (!distPtr || my_seed != getSeed()) {
+        delete distPtr;
+        distPtr = new RandomDistribution<float>(urand<float>(getGenerator()));
+        my_seed = getSeed();
+    }
+
+    char *outPtr = out.get();
+    for (int i = 0; i < (int)out.elements(); i++) outPtr[i] = distPtr->func() > 0.5;
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/range.hpp b/src/backend/cpu/kernel/range.hpp
new file mode 100644
index 0000000000..b244a19c85
--- /dev/null
+++ b/src/backend/cpu/kernel/range.hpp
@@ -0,0 +1,52 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Fills `output` so that each element holds its own coordinate along
+// dimension `dim` (0..3); any other value of `dim` writes nothing.
+template<typename T, int dim>
+void range(Array<T> output)
+{
+    T* dst = output.get();
+
+    const dim4 extent = output.dims();
+    const dim4 step   = output.strides();
+
+    for(dim_t w = 0; w < extent[3]; w++) {
+        const dim_t base3 = w * step[3];
+        for(dim_t z = 0; z < extent[2]; z++) {
+            const dim_t base2 = base3 + z * step[2];
+            for(dim_t y = 0; y < extent[1]; y++) {
+                // Dimension 0 is addressed with unit stride.
+                const dim_t rowOff = base2 + y * step[1];
+                for(dim_t x = 0; x < extent[0]; x++) {
+                    switch(dim) {
+                        case 0: dst[rowOff + x] = x; break;
+                        case 1: dst[rowOff + x] = y; break;
+                        case 2: dst[rowOff + x] = z; break;
+                        case 3: dst[rowOff + x] = w; break;
+                        default: break;
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/reduce.hpp b/src/backend/cpu/kernel/reduce.hpp
new file mode 100644
index 0000000000..85119dcee7
--- /dev/null
+++ b/src/backend/cpu/kernel/reduce.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Recursive step of a reduction: walks output dimension D-1 and forwards
+// each slice to the (D-1)-level functor.
+template<af_op_t op, typename Ti, typename To, int D>
+struct reduce_dim
+{
+    void operator()(Array<To> out, const dim_t outOffset,
+                    const Array<Ti> in, const dim_t inOffset,
+                    const int dim, bool change_nan, double nanval)
+    {
+        static const int axis = D - 1;
+        static reduce_dim<op, Ti, To, axis> lower;
+        const af::dim4 outStep = out.strides();
+        const af::dim4 inStep  = in.strides();
+        const af::dim4 outDims = out.dims();
+
+        for (dim_t i = 0; i < outDims[axis]; i++) {
+            lower(out, outOffset + i * outStep[axis],
+                  in, inOffset + i * inStep[axis], dim, change_nan, nanval);
+        }
+    }
+};
+
+// Terminal case of the reduction: folds the elements of `in` along `dim`,
+// starting at inOffset, into the single output element at outOffset.
+template<af_op_t op, typename Ti, typename To>
+struct reduce_dim<op, Ti, To, 0>
+{
+    Transform<Ti, To, op> transform;
+    Binary<To, op> reduce;
+    void operator()(Array<To> out, const dim_t outOffset,
+                    const Array<Ti> in, const dim_t inOffset,
+                    const int dim, bool change_nan, double nanval)
+    {
+        const dim_t step   = in.strides()[dim];
+        const dim_t length = in.dims()[dim];
+        To * const dst = out.get() + outOffset;
+        Ti const * const src = in.get() + inOffset;
+
+        To acc = reduce.init();
+        for (dim_t i = 0; i < length; i++) {
+            To val = transform(src[i * step]);
+            // When requested, replace NaN inputs by the caller's value.
+            if (change_nan) val = IS_NAN(val) ? nanval : val;
+            acc = reduce(val, acc);
+        }
+
+        *dst = acc;
+    }
+};
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/regions.hpp b/src/backend/cpu/kernel/regions.hpp
new file mode 100644
index 0000000000..863ebc5f48
--- /dev/null
+++ b/src/backend/cpu/kernel/regions.hpp
@@ -0,0 +1,194 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Node of the disjoint-set forest used to merge provisional region labels.
+ *
+ * A node stores the label it was created for, the smallest label recorded
+ * for its set, a rank and a parent link. A freshly constructed node is its
+ * own parent, i.e. the root of a set containing only itself.
+ */
+template<typename T>
+class LabelNode
+{
+private:
+    T label;            // label this node was created with
+    T minLabel;         // smallest label recorded for this node's set
+    unsigned rank;      // rank compared when two roots are merged
+    LabelNode* parent;  // parent link; equals `this` for a root
+
+public:
+    // Creates a root node carrying label 0.
+    LabelNode()
+        : label(0), minLabel(0), rank(0), parent(this)
+    { }
+
+    // Creates a root node whose label and minimum label are both `label`.
+    LabelNode(T label)
+        : label(label), minLabel(label), rank(0), parent(this)
+    { }
+
+    // Label assigned at construction; no setter exists for it.
+    T getLabel() { return label; }
+
+    // Smallest label recorded so far for this node's set.
+    T getMinLabel() { return minLabel; }
+
+    // Parent in the forest (the node itself when it is a root).
+    LabelNode* getParent() { return parent; }
+
+    // Current rank of the node.
+    unsigned getRank() { return rank; }
+
+    // Records `l` as the smallest label of the set.
+    void setMinLabel(T l) { minLabel = l; }
+
+    // Re-parents the node under `p`.
+    void setParent(LabelNode* p) { parent = p; }
+
+    // Overwrites the rank with `r`.
+    void setRank(unsigned r) { rank = r; }
+};
+
+// Returns the root of x's set; x itself is re-parented directly to that root.
+template<typename T>
+static LabelNode<T>* find(LabelNode<T>* x)
+{
+    LabelNode<T>* const up = x->getParent();
+    return (up == x) ? up : (x->setParent(find(up)), x->getParent());
+}
+
+// Merges the sets of x and y. Both roots record the smaller of the two
+// minimum labels; the lower-rank root is attached under the other, and on a
+// tie y's root goes under x's root, whose rank is then incremented.
+template<typename T>
+static void setUnion(LabelNode<T>* x, LabelNode<T>* y)
+{
+    LabelNode<T>* rootX = find(x);
+    LabelNode<T>* rootY = find(y);
+    if (rootX == rootY) return; // already in the same set
+
+    T minX = rootX->getMinLabel();
+    T minY = rootY->getMinLabel();
+    const T smallest = min(minX, minY);
+    rootX->setMinLabel(smallest);
+    rootY->setMinLabel(smallest);
+
+    const unsigned rankX = rootX->getRank();
+    const unsigned rankY = rootY->getRank();
+    if (rankX < rankY) { rootX->setParent(rootY); return; }
+    rootY->setParent(rootX);
+    if (rankX == rankY) rootX->setRank(rankX + 1);
+}
+
+// Labels the connected non-zero regions of the 2D mask `in`: every region
+// ends up with one label in `out`, renumbered so the labels are sequential.
+// NOTE(review): neighbour labels are read from `out` before this function
+// has written them all, so `out` is presumably zero-filled by the caller.
+template<typename T>
+void regions(Array<T> out, const Array<char> in, af_connectivity connectivity)
+{
+    const af::dim4 in_dims = in.dims();
+    const char *in_ptr  = in.get();
+    T    *out_ptr = out.get();
+
+    // Maps each provisional label to its heap-allocated node (freed below)
+    typedef typename std::map<T, LabelNode<T>* > label_map_t;
+    typedef typename label_map_t::iterator label_map_iterator_t;
+    label_map_t lmap;
+    // Initial label
+    T label = (T)1;
+
+    for (int j = 0; j < (int)in_dims[1]; j++) {
+        for (int i = 0; i < (int)in_dims[0]; i++) {
+            int idx = j * in_dims[0] + i;
+            if (in_ptr[idx] != 0) {
+                std::vector<T> l;
+                // Test neighbors
+                if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)
+                    l.push_back(out_ptr[j * in_dims[0] + i-1]);
+                if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
+                if (connectivity == AF_CONNECTIVITY_8 && i > 0 &&
+                        j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
+                if (connectivity == AF_CONNECTIVITY_8 &&
+                        i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
+
+                if (!l.empty()) {
+                    T minl = l[0];
+                    for (size_t k = 0; k < l.size(); k++) {
+                        minl = min(l[k], minl);
+                        label_map_iterator_t cur_map = lmap.find(l[k]);
+                        LabelNode<T> *node = cur_map->second;
+                        // Group labels of the same region under a disjoint set
+                        for (size_t m = k+1; m < l.size(); m++)
+                            setUnion(node, lmap.find(l[m])->second);
+                    }
+                    // Set label to smallest neighbor label
+                    out_ptr[idx] = minl;
+                }
+                else {
+                    // Insert new label in map
+                    LabelNode<T> *node = new LabelNode<T>(label);
+                    lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
+                    out_ptr[idx] = label++;
+                }
+            }
+        }
+    }
+
+    std::set<T> removed;
+    for (int j = 0; j < (int)in_dims[1]; j++) {
+        for (int i = 0; i < (int)in_dims[0]; i++) {
+            int idx = j * (int)in_dims[0] + i;
+            if (in_ptr[idx] != 0) {
+                T l = out_ptr[idx];
+                label_map_iterator_t cur_map = lmap.find(l);
+                if (cur_map != lmap.end()) {
+                    LabelNode<T>* node = cur_map->second;
+                    LabelNode<T>* node_root = find(node);
+                    out_ptr[idx] = node_root->getMinLabel();
+                    // Mark removed labels (those that are part of a region
+                    // that contains a smaller label)
+                    if (node->getMinLabel() < l || node_root->getMinLabel() < l)
+                        removed.insert(l);
+                    if (node->getLabel() > node->getMinLabel())
+                        removed.insert(node->getLabel());
+                }
+            }
+        }
+    }
+
+    // Calculate final neighbors (ensure final labels are sequential)
+    for (int j = 0; j < (int)in_dims[1]; j++) {
+        for (int i = 0; i < (int)in_dims[0]; i++) {
+            int idx = j * (int)in_dims[0] + i;
+            if (out_ptr[idx] > 0) {
+                out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));
+            }
+        }
+    }
+    // Release the nodes allocated in the first loop; nothing else owns them.
+    for (label_map_iterator_t it = lmap.begin(); it != lmap.end(); ++it)
+        delete it->second;
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/reorder.hpp b/src/backend/cpu/kernel/reorder.hpp
new file mode 100644
index 0000000000..c10c96ef36
--- /dev/null
+++ b/src/backend/cpu/kernel/reorder.hpp
@@ -0,0 +1,55 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Permutes the dimensions of `in` into `out`: the index along output
+// dimension k is used as the index along input dimension rdims[k].
+// oDims holds the extents of the output.
+template<typename T>
+void reorder(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
+{
+    T* dst = out.get();
+    const T* src = in.get();
+
+    const af::dim4 inStep  = in.strides();
+    const af::dim4 outStep = out.strides();
+
+    // srcPos[d]: coordinate along input dimension d of the element that
+    // feeds the output element currently being written.
+    dim_t srcPos[4] = {0};
+
+    for(dim_t ow = 0; ow < oDims[3]; ow++) {
+        srcPos[rdims[3]] = ow;
+        for(dim_t oz = 0; oz < oDims[2]; oz++) {
+            srcPos[rdims[2]] = oz;
+            for(dim_t oy = 0; oy < oDims[1]; oy++) {
+                srcPos[rdims[1]] = oy;
+                const dim_t dstRow = ow * outStep[3] + oz * outStep[2] + oy * outStep[1];
+                for(dim_t ox = 0; ox < oDims[0]; ox++) {
+                    srcPos[rdims[0]] = ox;
+                    const dim_t from = srcPos[3] * inStep[3] + srcPos[2] * inStep[2] +
+                                       srcPos[1] * inStep[1] + srcPos[0];
+                    dst[dstRow + ox] = src[from];
+                }
+            }
+        }
+    }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/resize.hpp b/src/backend/cpu/kernel/resize.hpp
new file mode 100644
index 0000000000..19d7ec7cf1
--- /dev/null
+++ b/src/backend/cpu/kernel/resize.hpp
@@ -0,0 +1,177 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Minimal stand-in for round(), which only exists in C99 / C++11 and
+ * later, so that this header also builds with older compilers.
+ *
+ * Not a full-fledged implementation: it adds 0.5 and truncates, which is
+ * only correct for non-negative values. It is used for computing indices.
+ * Declared static inline so that including this header from several
+ * translation units does not produce duplicate definitions.
+ */
+static inline dim_t round2int(float value)
+{
+    return (dim_t)(value+0.5f);
+}
+
+using std::conditional;
+using std::is_same;
+// wtype_t<T>: double when T is double, float for every other T.
+template<typename T>
+using wtype_t = typename conditional<is_same<T, double>::value, double, float>::type;
+// vtype_t<T>: T itself when is_complex<T>::value holds, otherwise wtype_t<T>.
+template<typename T>
+using vtype_t = typename conditional<is_complex<T>::value,
+                                     T, wtype_t<T>
+                                    >::type;
+
+// Fallback for interpolation methods that have no specialisation: no-op.
+template<typename T, af_interp_type method>
+struct resize_op
+{
+    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+              const af::dim4 &ostrides, const af::dim4 &istrides,
+              const dim_t x, const dim_t y)
+    {
+    }
+};
+
+// Nearest-neighbour resize of one output pixel: (x, y) receives, in every
+// slice along dimensions 2 and 3, the value of the closest input pixel.
+template<typename T>
+struct resize_op<T, AF_INTERP_NEAREST>
+{
+    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+            const af::dim4 &ostrides, const af::dim4 &istrides,
+            const dim_t x, const dim_t y)
+    {
+        // Scale the output coordinate back into the input, rounding to the
+        // nearest index and clamping to the last row/column.
+        dim_t srcX = round2int((float)x / (odims[0] / (float)idims[0]));
+        dim_t srcY = round2int((float)y / (odims[1] / (float)idims[1]));
+        if (srcX >= idims[0]) srcX = idims[0] - 1;
+        if (srcY >= idims[1]) srcY = idims[1] - 1;
+
+        const dim_t srcOff = srcY * istrides[1] + srcX;
+        const dim_t dstOff =    y * ostrides[1] +    x;
+        for(dim_t w = 0; w < odims[3]; w++) {
+            for(dim_t z = 0; z < odims[2]; z++) {
+                outPtr[dstOff + z * ostrides[2] + w * ostrides[3]] =
+                    inPtr[srcOff + z * istrides[2] + w * istrides[3]];
+            }
+        }
+    }
+};
+
+// Bilinear resize of one output pixel: (x, y) receives, in every slice along
+// dimensions 2 and 3, a weighted blend of four neighbouring input pixels.
+template<typename T>
+struct resize_op<T, AF_INTERP_BILINEAR>
+{
+    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+            const af::dim4 &ostrides, const af::dim4 &istrides,
+            const dim_t x, const dim_t y)
+    {
+        // Compute Indices
+        float f_x = (float)x / (odims[0] / (float)idims[0]);
+        float f_y = (float)y / (odims[1] / (float)idims[1]);
+        // (i1_x, i1_y): integer part of the scaled position, clamped to the input
+        dim_t i1_x  = floor(f_x);
+        dim_t i1_y  = floor(f_y);
+        if (i1_x >= idims[0]) i1_x = idims[0] - 1;
+        if (i1_y >= idims[1]) i1_y = idims[1] - 1;
+        // Fractional offsets from (i1_x, i1_y): b along x, a along y
+        float b   = f_x - i1_x;
+        float a   = f_y - i1_y;
+        // (i2_x, i2_y): the next sample in each direction, also clamped
+        dim_t i2_x  = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1);
+        dim_t i2_y  = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1);
+        // WT: float/double type of the weights; VT: type the samples are read as
+        typedef typename dtype_traits<T>::base_type BT;
+        typedef wtype_t<BT> WT;
+        typedef vtype_t<T> VT;
+        dim_t o_off = y * ostrides[1] + x;
+        // Copy values from all channels
+        for(dim_t w = 0; w < odims[3]; w++) {
+            dim_t wst = w * istrides[3];
+            for(dim_t z = 0; z < odims[2]; z++) {
+                dim_t zst = z * istrides[2];
+                dim_t channel_off = zst + wst;
+                VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off];
+                VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off];
+                VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off];
+                VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off];
+                // Weights: (1-a)(1-b) for p1, a(1-b) for p2, (1-a)b for p3, ab for p4
+                outPtr[o_off + z * ostrides[2] + w * ostrides[3]] =
+                                scalar<WT>((1.0f - a) * (1.0f - b)) * p1 +
+                                scalar<WT>((    a   ) * (1.0f - b)) * p2 +
+                                scalar<WT>((1.0f - a) * (    b   )) * p3 +
+                                scalar<WT>((    a   ) * (    b   )) * p4;
+            }
+        }
+    }
+};
+
+// "Lower" resize of one output pixel: (x, y) receives, in every slice along
+// dimensions 2 and 3, the input pixel at the floor of its scaled position.
+template<typename T>
+struct resize_op<T, AF_INTERP_LOWER>
+{
+    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+            const af::dim4 &ostrides, const af::dim4 &istrides,
+            const dim_t x, const dim_t y)
+    {
+        // Scale the output coordinate back into the input, rounding down and
+        // clamping to the last row/column.
+        dim_t srcX = floor((float)x / (odims[0] / (float)idims[0]));
+        dim_t srcY = floor((float)y / (odims[1] / (float)idims[1]));
+        if (srcX >= idims[0]) srcX = idims[0] - 1;
+        if (srcY >= idims[1]) srcY = idims[1] - 1;
+
+        const dim_t srcOff = srcY * istrides[1] + srcX;
+        const dim_t dstOff =    y * ostrides[1] +    x;
+        for(dim_t w = 0; w < odims[3]; w++) {
+            for(dim_t z = 0; z < odims[2]; z++) {
+                outPtr[dstOff + z * ostrides[2] + w * ostrides[3]] =
+                    inPtr[srcOff + z * istrides[2] + w * istrides[3]];
+            }
+        }
+    }
+};
+
+// Resizes `in` into `out` with interpolation `method` by invoking the
+// per-pixel functor once for every (x, y) position of the output.
+template<typename T, af_interp_type method>
+void resize(Array<T> out, const Array<T> in)
+{
+    const af::dim4 idims    = in.dims();
+    const af::dim4 odims    = out.dims();
+    const T *src            = in.get();
+          T *dst            = out.get();
+    const af::dim4 ostrides = out.strides();
+    const af::dim4 istrides = in.strides();
+
+    resize_op<T, method> pixelOp;
+    for(dim_t row = 0; row < odims[1]; row++)
+        for(dim_t col = 0; col < odims[0]; col++)
+            pixelOp(dst, src, odims, idims, ostrides, istrides, col, row);
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/rotate.hpp b/src/backend/cpu/kernel/rotate.hpp
new file mode 100644
index 0000000000..395ea3f303
--- /dev/null
+++ b/src/backend/cpu/kernel/rotate.hpp
@@ -0,0 +1,84 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Rotates `input` into `output` using an angle of `theta` radians (it is fed
+// to cos/sin), sampling each output pixel with interpolation `method` through
+// one of the transform_n / transform_b / transform_l helpers.
+template<typename T, af_interp_type method>
+void rotate(Array<T> output, const Array<T> input, const float theta)
+{
+    const af::dim4 odims    = output.dims();
+    const af::dim4 idims    = input.dims();
+    const af::dim4 ostrides = output.strides();
+    const af::dim4 istrides = input.strides();
+    const T* in   = input.get();
+          T* out  = output.get();
+    dim_t nimages = idims[2];
+    // Per-pixel sampling routine; assigned from `method` in the switch below
+    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
+                 const af::dim4 &, const af::dim4 &,
+                 const dim_t, const dim_t, const dim_t, const dim_t,
+                 const bool);
+    // Rotation by -theta; tx/ty make the rotated output centre land on the input centre
+    const float c = cos(-theta), s = sin(-theta);
+    float tx, ty;
+    {
+        const float nx = 0.5 * (idims[0] - 1);
+        const float ny = 0.5 * (idims[1] - 1);
+        const float mx = 0.5 * (odims[0] - 1);
+        const float my = 0.5 * (odims[1] - 1);
+        const float sx = (mx * c + my *-s);
+        const float sy = (mx * s + my * c);
+        tx = -(sx - nx);
+        ty = -(sy - ny);
+    }
+    // Six coefficients {c, -s, tx, s, c, ty}, each rounded to three decimal places
+    const float tmat[6] = {std::round( c * 1000) / 1000.0f,
+                           std::round(-s * 1000) / 1000.0f,
+                           std::round(tx * 1000) / 1000.0f,
+                           std::round( s * 1000) / 1000.0f,
+                           std::round( c * 1000) / 1000.0f,
+                           std::round(ty * 1000) / 1000.0f,
+                          };
+    // Pick the sampler matching `method`; anything else is reported via AF_ERROR
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            t_fn = &transform_n;
+            break;
+        case AF_INTERP_BILINEAR:
+            t_fn = &transform_b;
+            break;
+        case AF_INTERP_LOWER:
+            t_fn = &transform_l;
+            break;
+        default:
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
+    }
+    // Do transform for image
+    for(int yy = 0; yy < (int)odims[1]; yy++) {
+        for(int xx = 0; xx < (int)odims[0]; xx++) {
+            t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy, false);
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/scan.hpp b/src/backend/cpu/kernel/scan.hpp
new file mode 100644
index 0000000000..0bcfe7df17
--- /dev/null
+++ b/src/backend/cpu/kernel/scan.hpp
@@ -0,0 +1,72 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Recursive step of the scan: enqueues one (D-1)-level scan per index of
+// output dimension D-1, or only the first one when D-1 is the scanned `dim`.
+template<af_op_t op, typename Ti, typename To, int D>
+struct scan_dim
+{
+    void operator()(Array<To> out, dim_t outOffset,
+                    const Array<Ti> in, dim_t inOffset,
+                    const int dim) const
+    {
+        const int axis = D - 1;
+        const dim4 outDims = out.dims();
+        const dim4 outStep = out.strides();
+        const dim4 inStep  = in.strides();
+        for (dim_t i = 0; i < outDims[axis]; i++) {
+            scan_dim<op, Ti, To, axis> next;
+            getQueue().enqueue(next, out, outOffset + i * outStep[axis],
+                               in, inOffset + i * inStep[axis], dim);
+            if (axis == dim) break;
+        }
+    }
+};
+
+// Terminal case of the scan: computes an inclusive scan of one line of
+// `input` along `dim`, writing every running value to `output`.
+template<af_op_t op, typename Ti, typename To>
+struct scan_dim<op, Ti, To, 0>
+{
+    void operator()(Array<To> output, dim_t outOffset,
+                    const Array<Ti> input,  dim_t inOffset,
+                    const int dim) const
+    {
+        const Ti* src = input.get() + inOffset;
+              To* dst = output.get()+ outOffset;
+
+        const dim_t dstStep = output.strides()[dim];
+        const dim_t srcStep = input.strides()[dim];
+        const dim_t length  = input.dims()[dim];
+
+        Transform<Ti, To, op> transform;
+        // Binary operation that combines each element with the running value
+        Binary<To, op> combine;
+
+        // The running value starts from the operation's init() value
+        To running = combine.init();
+        for (dim_t i = 0; i < length; i++) {
+            To val = transform(src[i * srcStep]);
+            running = combine(val, running);
+            dst[i * dstStep] = running;
+        }
+    }
+};
+
+}
+}
diff --git a/src/backend/cpu/kernel/select.hpp b/src/backend/cpu/kernel/select.hpp
new file mode 100644
index 0000000000..1099c7e437
--- /dev/null
+++ b/src/backend/cpu/kernel/select.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Element-wise select: out[i] = cond[i] ? a[i] : b[i].
+// In every dimension where an operand's extent differs from the output's,
+// that operand is always read at index 0, i.e. it is broadcast.
+// Offsets are held in dim_t so large arrays cannot overflow a 32-bit int.
+template<typename T>
+void select(Array<T> out, const Array<char> cond, const Array<T> a, const Array<T> b)
+{
+    af::dim4 adims = a.dims();
+    af::dim4 astrides = a.strides();
+    af::dim4 bdims = b.dims();
+    af::dim4 bstrides = b.strides();
+
+    af::dim4 cdims = cond.dims();
+    af::dim4 cstrides = cond.strides();
+
+    af::dim4 odims = out.dims();
+    af::dim4 ostrides = out.strides();
+
+    bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1],
+        adims[2] == odims[2], adims[3] == odims[3]};
+
+    bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1],
+        bdims[2] == odims[2], bdims[3] == odims[3]};
+
+    bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1],
+        cdims[2] == odims[2], cdims[3] == odims[3]};
+
+    const T *aptr = a.get();
+    const T *bptr = b.get();
+    T *optr = out.get();
+    const char *cptr = cond.get();
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+        dim_t o_off3   = ostrides[3] * l;
+        dim_t a_off3   = astrides[3] * is_a_same[3] * l;
+        dim_t b_off3   = bstrides[3] * is_b_same[3] * l;
+        dim_t c_off3   = cstrides[3] * is_c_same[3] * l;
+
+        for (dim_t k = 0; k < odims[2]; k++) {
+            dim_t o_off2   = ostrides[2] * k + o_off3;
+            dim_t a_off2   = astrides[2] * is_a_same[2] * k + a_off3;
+            dim_t b_off2   = bstrides[2] * is_b_same[2] * k + b_off3;
+            dim_t c_off2   = cstrides[2] * is_c_same[2] * k + c_off3;
+
+            for (dim_t j = 0; j < odims[1]; j++) {
+                dim_t o_off1   = ostrides[1] * j + o_off2;
+                dim_t a_off1   = astrides[1] * is_a_same[1] * j + a_off2;
+                dim_t b_off1   = bstrides[1] * is_b_same[1] * j + b_off2;
+                dim_t c_off1   = cstrides[1] * is_c_same[1] * j + c_off2;
+
+                for (dim_t i = 0; i < odims[0]; i++) {
+                    bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1];
+                    T    aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1];
+                    T    bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1];
+                    T    oval = cval ? aval : bval;
+                    optr[o_off1 + i] = oval;
+                }
+            }
+        }
+    }
+}
+
+// Selects between an array and a scalar: where the condition (inverted when
+// `flip` is true) holds, the element of `a` is copied; elsewhere `b` is stored.
+// Condition bytes are normalised to bool before the XOR, so any non-zero
+// byte counts as true. Offsets are dim_t so they cannot overflow an int.
+template<typename T, bool flip>
+void select_scalar(Array<T> out, const Array<char> cond, const Array<T> a, const double b)
+{
+    af::dim4 astrides = a.strides();
+    af::dim4 cstrides = cond.strides();
+
+    af::dim4 odims = out.dims();
+    af::dim4 ostrides = out.strides();
+
+    const T *aptr = a.get();
+    T *optr = out.get();
+    const char *cptr = cond.get();
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+        dim_t o_off3 = ostrides[3] * l;
+        dim_t a_off3 = astrides[3] * l;
+        dim_t c_off3 = cstrides[3] * l;
+
+        for (dim_t k = 0; k < odims[2]; k++) {
+            dim_t o_off2 = ostrides[2] * k + o_off3;
+            dim_t a_off2 = astrides[2] * k + a_off3;
+            dim_t c_off2 = cstrides[2] * k + c_off3;
+
+            for (dim_t j = 0; j < odims[1]; j++) {
+                dim_t o_off1 = ostrides[1] * j + o_off2;
+                dim_t a_off1 = astrides[1] * j + a_off2;
+                dim_t c_off1 = cstrides[1] * j + c_off2;
+
+                for (dim_t i = 0; i < odims[0]; i++) {
+                    optr[o_off1 + i] = (flip ^ (cptr[c_off1 + i] != 0)) ? aptr[a_off1 + i] : b;
+                }
+            }
+        }
+    }
+}
+
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/shift.hpp b/src/backend/cpu/kernel/shift.hpp
new file mode 100644
index 0000000000..8beb975486
--- /dev/null
+++ b/src/backend/cpu/kernel/shift.hpp
@@ -0,0 +1,69 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <cassert>
+
+namespace cpu
+{
+namespace kernel
+{
+
+static inline dim_t simple_mod(const dim_t i, const dim_t dim)
+{
+    return (i >= dim) ? (i - dim) : i; // one wrap only: expects 0 <= i < 2 * dim
+}
+
+// Circular shift: element i of `in` along dimension d lands at index
+// (i + sdims[d]) modulo the extent in `out`; negative shifts go the other way.
+// Offsets are held in dim_t so large arrays cannot overflow a 32-bit int.
+template<typename T>
+void shift(Array<T> out, const Array<T> in, const af::dim4 sdims)
+{
+    T* outPtr = out.get();
+    const T* inPtr = in.get();
+    const af::dim4 oDims = out.dims();
+    const af::dim4 ist   = in.strides();
+    const af::dim4 ost   = out.strides();
+    dim_t sdims_[4];
+    // Need to do this because we are mapping output to input in the kernel
+    for(int i = 0; i < 4; i++) {
+        // sdims_[i] will always be positive and always [0, oDims[i]].
+        // Negative shifts are converted to positive by going the other way round
+        sdims_[i] = -(sdims[i] % oDims[i]) + oDims[i] * (sdims[i] > 0);
+        assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]);
+    }
+
+    for(dim_t ow = 0; ow < oDims[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iw = simple_mod((ow + sdims_[3]), oDims[3]);
+        const dim_t iW = iw * ist[3];
+        for(dim_t oz = 0; oz < oDims[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iz = simple_mod((oz + sdims_[2]), oDims[2]);
+            const dim_t iZW = iW + iz * ist[2];
+            for(dim_t oy = 0; oy < oDims[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iy = simple_mod((oy + sdims_[1]), oDims[1]);
+                const dim_t iYZW = iZW + iy * ist[1];
+                for(dim_t ox = 0; ox < oDims[0]; ox++) {
+                    const dim_t oIdx = oYZW + ox;
+                    const dim_t ix = simple_mod((ox + sdims_[0]), oDims[0]);
+                    const dim_t iIdx = iYZW + ix;
+                    outPtr[oIdx] = inPtr[iIdx];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp
new file mode 100644
index 0000000000..e7ca19175c
--- /dev/null
+++ b/src/backend/cpu/kernel/sift_nonfree.hpp
@@ -0,0 +1,1196 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+// The source code contained in this file is based on the original code by
+// Rob Hess. Please note that SIFT is an algorithm patented and protected
+// by US law, before using this code or any binary forms generated from it,
+// verify that you have permission to do so. The original license by Rob Hess
+// can be read below:
+//
+// Copyright (c) 2006-2012, Rob Hess <rob@iqengines.com>
+// All rights reserved.
+//
+// The following patent has been issued for methods embodied in this
+// software: "Method and apparatus for identifying scale invariant features
+// in an image and use of same for locating an object in an image," David
+// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
+// filed March 8, 1999. Asignee: The University of British Columbia. For
+// further details, contact David Lowe (lowe@cs.ubc.ca) or the
+// University-Industry Liaison Office of the University of British
+// Columbia.
+//
+// Note that restrictions imposed by this patent (and possibly others)
+// exist independently of and may be in conflict with the freedoms granted
+// in this license, which refers to copyright of the program, not patents
+// for any methods that it implements.  Both copyright and patent law must
+// be obeyed to legally use and redistribute this program and it is not the
+// purpose of this license to induce you to infringe any patents or other
+// property right claims or to contest validity of any such claims.  If you
+// redistribute or use the program, then this license merely protects you
+// from committing copyright infringement.  It does not protect you from
+// committing patent infringement.  So, before you do anything with this
+// program, make sure that you have permission to do so not merely in terms
+// of copyright, but also in terms of patent law.
+//
+// Please note that this license is not to be understood as a guarantee
+// either.  If you use the program according to this license, but in
+// conflict with patent law, it does not mean that the licensor will refund
+// you for any losses that you incur if you are sued for your patent
+// infringement.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//     * Redistributions of source code must retain the above copyright and
+//       patent notices, this list of conditions and the following
+//       disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//     * Neither the name of Oregon State University nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+using af::dim4;
+
+namespace cpu
+{
+
+static const float PI_VAL = 3.14159265358979323846f;
+
+// default width of descriptor histogram array
+static const int DescrWidth = 4;
+
+// default number of bins per histogram in descriptor array
+static const int DescrHistBins = 8;
+
+// assumed gaussian blur for input image
+static const float InitSigma = 0.5f;
+
+// width of border in which to ignore keypoints
+static const int ImgBorder = 5;
+
+// maximum steps of keypoint interpolation before failure
+static const int MaxInterpSteps = 5;
+
+// default number of bins in histogram for orientation assignment
+static const int OriHistBins = 36;
+
+// determines gaussian sigma for orientation assignment
+static const float OriSigFctr = 1.5f;
+
+// determines the radius of the region used in orientation assignment
+static const float OriRadius = 3.0f * OriSigFctr;
+
+// number of passes of orientation histogram smoothing
+static const int SmoothOriPasses = 2;
+
+// orientation magnitude relative to max that results in new feature
+static const float OriPeakRatio = 0.8f;
+
+// determines the size of a single descriptor orientation histogram
+static const float DescrSclFctr = 3.f;
+
+// threshold on magnitude of elements of descriptor vector
+static const float DescrMagThr = 0.2f;
+
+// factor used to convert floating-point descriptor to unsigned char
+static const float IntDescrFctr = 512.f;
+
+// Number of GLOH bins in radial direction
+static const unsigned GLOHRadialBins = 3;
+
+// Radii of the GLOH descriptor rings (one entry per radial bin)
+static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+static const unsigned GLOHHistBins = 16;
+
+// Keypoint record used for sorting and duplicate removal.
+// f[0..3] hold the float attributes in the order written by array_to_feat
+// (x, y, response, size); l holds the pyramid layer.
+typedef struct
+{
+    float    f[4];
+    unsigned l;
+} feat_t;
+
+// Lexicographic "less than" over (f[0], f[1], f[2], f[3], l).
+//
+// Returns true only when i orders strictly before j. Two records with
+// identical fields compare false in both directions, which is what a
+// comparator handed to std::sort must do (strict weak ordering); returning
+// true for equal records is undefined behaviour there.
+bool feat_cmp(feat_t i, feat_t j)
+{
+    for (int k = 0; k < 4; k++)
+        if (i.f[k] != j.f[k])
+            return (i.f[k] < j.f[k]);
+    if (i.l != j.l)
+        return (i.l < j.l);
+
+    // All fields equal: neither element precedes the other
+    return false;
+}
+
+// Gathers the parallel keypoint arrays into `feat`, one feat_t per keypoint.
+// `feat` is resized to nfeat; entry k receives x[k], y[k], resp[k], size[k]
+// in f[0..3] and layer[k] in l.
+void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
+{
+    feat.resize(nfeat);
+
+    unsigned k = 0;
+    for (std::vector<feat_t>::iterator it = feat.begin(); it != feat.end(); ++it, ++k) {
+        it->f[0] = x[k];
+        it->f[1] = y[k];
+        it->f[2] = resp[k];
+        it->f[3] = size[k];
+        it->l    = layer[k];
+    }
+}
+
+// Fills out[0..dim) with samples of a Gaussian centred on index (dim-1)/2 and
+// rescales them so that they sum to one.
+// A non-positive sigma selects the default of 0.25 * dim.
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+    if (!(sigma > 0)) {
+        sigma = 0.25*dim;
+    }
+
+    // Both terms are independent of the sample index
+    const double amplitude = 1. / sqrt(2 * PI_VAL * sigma*sigma);
+    const double two_var   = 2*(sigma*sigma);
+    const int    centre    = (dim-1)/2;
+
+    T total = (T)0;
+    for (int idx = 0; idx < dim; ++idx) {
+        const int x = idx - centre;
+        const T sample = amplitude * exp(-((x*x)/two_var));
+        out[idx] = sample;
+        total   += sample;
+    }
+
+    // Normalise to unit sum
+    for (T* cur = out; cur != out + dim; ++cur) {
+        *cur /= total;
+    }
+}
+
+// Builds a 1D Gaussian filter array for the given sigma.
+// The length is round(6*sigma + 1) forced odd (lowest bit set) and capped at 31.
+template<typename T>
+Array<T> gauss_filter(float sigma)
+{
+    // Using 6-sigma rule
+    const unsigned odd_len   = (unsigned)round(sigma * 6 + 1) | 1;
+    const unsigned gauss_len = std::min(odd_len, 31u);
+
+    Array<T> filter = createEmptyArray<T>(gauss_len);
+    T* coeffs = (T*)getDevicePtr(filter);
+    gaussian1D(coeffs, gauss_len, sigma);
+
+    return filter;
+}
+
+// Solves the N x N linear system A * x = b by Gaussian elimination.
+//
+// A - row-major N*N matrix; overwritten with its upper-triangular form
+// b - right-hand side of length N; overwritten during elimination
+// x - output solution of length N
+//
+// No pivoting is performed: a zero (or tiny) pivot A[i*N+i] produces
+// infinite or NaN entries in x, which callers must be prepared to reject.
+template<int N>
+void gaussianElimination(float* A, float* b, float* x)
+{
+    // forward elimination
+    for (int i = 0; i < N-1; i++) {
+        for (int j = i+1; j < N; j++) {
+            float s = A[j*N+i] / A[i*N+i];
+
+            for (int k = i; k < N; k++)
+                A[j*N+k] -= s * A[i*N+k];
+
+            b[j] -= s * b[i];
+        }
+    }
+
+    // backward substitution: rows must be solved last-to-first, because row i
+    // of the triangular system depends on x[i+1..N-1] being known already
+    for (int i = N-1; i >= 0; i--) {
+        float sum = b[i];
+        for (int j = i+1; j < N; j++)
+            sum -= A[i*N+j] * x[j];
+        x[i] = sum / A[i*N+i];
+    }
+}
+
+// Element-wise difference: out[k] = in1[k] - in2[k] for every k below
+// in1.elements(). The arrays are walked linearly through their raw pointers.
+template<typename T>
+void sub(
+    Array<T>& out,
+    const Array<T>& in1,
+    const Array<T>& in2)
+{
+    const size_t count = in1.elements();
+    T* dst = out.get();
+    const T* lhs = in1.get();
+    const T* rhs = in2.get();
+
+    for (size_t k = 0; k != count; ++k)
+        dst[k] = lhs[k] - rhs[k];
+}
+
+#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
+#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
+#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)])
+
+// Determines whether a pixel is a scale-space extremum by comparing it to its
+// 3x3x3 pixel neighborhood: the 8 surrounding samples of `center` plus the 9
+// co-located samples of both `prev` and `next`.
+//
+// A sample qualifies when its magnitude exceeds `threshold` and it is either
+// positive and strictly greater than all 26 neighbors, or negative and
+// strictly smaller than all of them. Samples closer than ImgBorder to any
+// edge are never examined.
+//
+// x_out, y_out, layer_out - output arrays, appended to at index *counter
+// counter                 - running number of stored extrema (in/out)
+// layer                   - value written to layer_out for every hit
+// max_feat                - capacity of the output arrays; extra hits are dropped
+//
+// Note the swap on output: the loop variable `x` walks dimension 0 and `y`
+// dimension 1, and they are stored as x_out = y, y_out = x.
+template<typename T>
+void detectExtrema(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    unsigned* counter,
+    const Array<T>& prev,
+    const Array<T>& center,
+    const Array<T>& next,
+    const unsigned layer,
+    const unsigned max_feat,
+    const float threshold)
+{
+    const af::dim4 idims = center.dims();
+    const T* prev_ptr    = prev.get();
+    const T* center_ptr  = center.get();
+    const T* next_ptr    = next.get();
+
+    for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
+        for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
+            const float p = center_ptr[y*idims[0] + x];
+
+            // fabs rather than abs: an unqualified abs may resolve to the
+            // integer overload and truncate the response to zero
+            if (!(fabs(p) > threshold))
+                continue;
+
+            // Only strictly positive maxima or strictly negative minima count
+            const bool is_pos = (p > 0);
+            if (!is_pos && !(p < 0))
+                continue;
+
+            bool extremum = true;
+            for (int dy = -1; dy <= 1 && extremum; dy++) {
+                for (int dx = -1; dx <= 1 && extremum; dx++) {
+                    const T pv = PPTR(y+dy, x+dx);
+                    const T nv = NPTR(y+dy, x+dx);
+                    extremum = is_pos ? (p > pv && p > nv)
+                                      : (p < pv && p < nv);
+
+                    // The sample itself is not part of its own neighborhood
+                    if (extremum && (dy != 0 || dx != 0)) {
+                        const T cv = CPTR(y+dy, x+dx);
+                        extremum = is_pos ? (p > cv) : (p < cv);
+                    }
+                }
+            }
+
+            if (extremum && *counter < max_feat) {
+                x_out[*counter] = (float)y;
+                y_out[*counter] = (float)x;
+                layer_out[*counter] = layer;
+                (*counter)++;
+            }
+        }
+    }
+}
+
+// Interpolates a scale-space extremum's location and scale to subpixel
+// accuracy to form an image feature. Rejects features with low contrast.
+// Based on Section 4 of Lowe's paper.
+//
+// For each candidate (x_in, y_in, layer_in) the routine repeatedly solves
+// H * X = dD (Hessian and gradient of the DoG sample) and moves the sample
+// point by the rounded offset until every offset component is below 0.5, for
+// at most MaxInterpSteps iterations. Candidates are dropped when they
+//   - do not converge, leave the image border or the valid layer range,
+//   - produce a non-finite / out-of-range offset (singular Hessian),
+//   - have interpolated contrast below contrast_thr / n_layers,
+//   - fail the principal-curvature (edge) test controlled by edge_thr.
+//
+// Survivors are appended to the *_out arrays at index *counter (up to
+// max_feat entries), with coordinates scaled by 2^octave.
+//
+// Coordinate convention inside this function: `x` indexes dimension 1 and
+// `y` dimension 0, so CPTR(x, y) addresses center_ptr[x*idims[0] + y].
+template<typename T>
+void interpolateExtrema(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    unsigned* counter,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const unsigned extrema_feat,
+    std::vector< Array<T> >& dog_pyr,
+    const unsigned max_feat,
+    const unsigned octave,
+    const unsigned n_layers,
+    const float contrast_thr,
+    const float edge_thr,
+    const float sigma,
+    const float img_scale)
+{
+    const float first_deriv_scale = img_scale*0.5f;
+    const float second_deriv_scale = img_scale;
+    const float cross_deriv_scale = img_scale*0.25f;
+
+    // Offsets at or beyond this magnitude (or NaN) cannot be rounded to int
+    // safely and can only come from a degenerate Hessian
+    const float max_offset = 1e9f;
+
+    // All layers of one octave share the same dimensions
+    const af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
+
+    for (int f = 0; f < (int)extrema_feat; f++) {
+        float xl = 0, xy = 0, xx = 0, contr = 0;
+        int i = 0;
+
+        // Signed on purpose: the rounded offsets added below may be negative
+        int x = (int)x_in[f];
+        int y = (int)y_in[f];
+        int layer = (int)layer_in[f];
+
+        const T* prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
+        const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
+        const T* next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
+
+        bool converges = true;
+
+        for (i = 0; i < MaxInterpSteps; i++) {
+            float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
+                           (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
+                           (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
+
+            float d2  = CPTR(x, y) * 2.f;
+            float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
+            float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+            float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
+            float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
+                         CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+            float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
+                         PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
+            // Mixed y/scale derivative: same stencil as dxs, taken along y
+            float dys = (NPTR(x, y+1) - NPTR(x, y-1) -
+                         PPTR(x, y+1) + PPTR(x, y-1)) * cross_deriv_scale;
+
+            float H[9] = {dxx, dxy, dxs,
+                          dxy, dyy, dys,
+                          dxs, dys, dss};
+
+            float X[3];
+            gaussianElimination<3>(H, dD, X);
+
+            xl = -X[2];
+            xy = -X[1];
+            xx = -X[0];
+
+            if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
+                break;
+
+            // Written so that NaN fails the test as well
+            if (!(fabs(xx) < max_offset && fabs(xy) < max_offset && fabs(xl) < max_offset)) {
+                converges = false;
+                break;
+            }
+
+            x += (int)round(xx);
+            y += (int)round(xy);
+            layer += (int)round(xl);
+
+            if (layer < 1 || layer > (int)n_layers ||
+                x < ImgBorder || x >= idims[1] - ImgBorder ||
+                y < ImgBorder || y >= idims[0] - ImgBorder) {
+                converges = false;
+                break;
+            }
+
+            // The sample point may have moved to another layer: re-centre
+            // the three DoG slices so the next step reads the right data
+            prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
+            center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
+            next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
+        }
+
+        // ensure convergence of interpolation
+        if (i >= MaxInterpSteps || !converges)
+            continue;
+
+        float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
+                       (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
+                       (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
+        float X[3] = {xx, xy, xl};
+
+        float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
+
+        contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
+        // fabs rather than abs: abs may resolve to the integer overload
+        if (fabs(contr) < (contrast_thr / n_layers))
+            continue;
+
+        // principal curvatures are computed using the trace and det of Hessian
+        float d2  = CPTR(x, y) * 2.f;
+        float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
+        float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+        float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
+                     CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+
+        float tr = dxx + dyy;
+        float det = dxx * dyy - dxy * dxy;
+
+        // add FLT_EPSILON for double-precision compatibility
+        if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
+            continue;
+
+        if (*counter < max_feat)
+        {
+            x_out[*counter] = (x + xx) * (1 << octave);
+            y_out[*counter] = (y + xy) * (1 << octave);
+            layer_out[*counter] = layer;
+            response_out[*counter] = fabs(contr);
+            size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
+            (*counter)++;
+        }
+    }
+}
+
+#undef CPTR
+#undef PPTR
+#undef NPTR
+
+// Remove duplicate keypoints.
+// Walks the (already sorted) records and copies each one to the output
+// arrays at index *counter unless it matches its immediate successor: same
+// layer and all four float fields equal after scaling by 1e4 and rounding.
+// Of a run of duplicates only the last record is therefore kept.
+void removeDuplicates(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    unsigned* counter,
+    const std::vector<feat_t>& sorted_feat)
+{
+    const size_t nfeat = sorted_feat.size();
+    const float prec_fctr = 1e4f;
+
+    for (size_t f = 0; f < nfeat; f++) {
+        const feat_t& cur = sorted_feat[f];
+
+        if (f + 1 < nfeat) {
+            const feat_t& nxt = sorted_feat[f+1];
+
+            bool same = (cur.l == nxt.l);
+            for (int k = 0; k < 4 && same; k++)
+                same = (round(cur.f[k]*prec_fctr) == round(nxt.f[k]*prec_fctr));
+
+            if (same)
+                continue;
+        }
+
+        const unsigned slot = *counter;
+        x_out[slot]        = cur.f[0];
+        y_out[slot]        = cur.f[1];
+        response_out[slot] = cur.f[2];
+        size_out[slot]     = cur.f[3];
+        layer_out[slot]    = cur.l;
+        *counter = slot + 1;
+    }
+}
+
+#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
+
+// Assigns canonical orientations to keypoints, following Section 5 of Lowe's
+// paper. For every input keypoint a 36-bin gradient-orientation histogram is
+// accumulated over a square window in the keypoint's Gaussian pyramid layer,
+// smoothed SmoothOriPasses times with a circular [0.25 0.5 0.25] kernel, and
+// every local peak reaching OriPeakRatio of the maximum emits one output
+// feature. A keypoint may thus yield several outputs (or none).
+//
+// Outputs are appended at index *counter while *counter < max_feat. When
+// double_input is set, the emitted x, y and size are halved.
+template<typename T>
+void calcOrientation(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    float* ori_out,
+    unsigned* counter,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const unsigned max_feat,
+    const unsigned octave,
+    const unsigned n_layers,
+    const bool double_input)
+{
+    const int n = OriHistBins;
+
+    float hist[OriHistBins];
+    float temphist[OriHistBins];
+
+    // Multiplying by 1.0f leaves the values untouched when the input image
+    // was not doubled
+    const float out_scale = double_input ? 0.5f : 1.0f;
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        // Keypoint attributes
+        const float kp_x        = x_in[f];
+        const float kp_y        = y_in[f];
+        const unsigned kp_layer = layer_in[f];
+        const float kp_resp     = response_in[f];
+        const float kp_size     = size_in[f];
+
+        // Keypoint position in the coordinates of this octave
+        const int pt_x = (int)round(kp_x / (1 << octave));
+        const int pt_y = (int)round(kp_y / (1 << octave));
+
+        // Window geometry and Gaussian weighting parameters
+        const float scl_octv  = kp_size*0.5f / (1 << octave);
+        const int   radius    = (int)round(OriRadius * scl_octv);
+        const float sigma     = OriSigFctr * scl_octv;
+        const int   len       = (radius*2+1);
+        const float exp_denom = 2.f * sigma * sigma;
+
+        // Gaussian pyramid layer the keypoint was found in
+        const Array<T> img = gauss_pyr[octave*(n_layers+3) + kp_layer];
+        const T* img_ptr = img.get();
+
+        for (int b = 0; b < OriHistBins; b++)
+            hist[b] = 0.f;
+
+        af::dim4 idims = img.dims();
+
+        // Accumulate the weighted orientation histogram over the window
+        for (int cell = 0; cell < len*len; cell++) {
+            const int i = cell / len - radius;
+            const int j = cell % len - radius;
+
+            const int y = pt_y + i;
+            const int x = pt_x + j;
+
+            // Central differences need one sample of margin on every side
+            const bool inside = (y >= 1 && y < idims[0] - 1 &&
+                                 x >= 1 && x < idims[1] - 1);
+            if (!inside)
+                continue;
+
+            const float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+            const float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+            const float mag = sqrt(dx*dx+dy*dy);
+            const float ori = atan2(dy,dx);
+            const float w = exp(-(i*i + j*j)/exp_denom);
+
+            int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
+            if (!(bin < n))
+                bin = 0;
+
+            hist[bin] += w*mag;
+        }
+
+        // Circular smoothing of the histogram
+        for (int pass = 0; pass < SmoothOriPasses; pass++) {
+            for (int j = 0; j < n; j++)
+                temphist[j] = hist[j];
+
+            for (int j = 0; j < n; j++) {
+                const float before = temphist[(j == 0) ? n-1 : j-1];
+                const float after  = temphist[(j+1 == n) ? 0 : j+1];
+                hist[j] = 0.25f * before + 0.5f * temphist[j] + 0.25f * after;
+            }
+        }
+
+        float omax = hist[0];
+        for (int b = 1; b < n; b++)
+            omax = max(omax, hist[b]);
+
+        const float mag_thr = (float)(omax * OriPeakRatio);
+
+        // Emit one feature per sufficiently strong local peak
+        for (int j = 0; j < n; j++) {
+            const int l = (j == 0) ? n - 1 : j - 1;
+            const int r = (j + 1) % n;
+
+            const bool is_peak = hist[j] > hist[l] &&
+                                 hist[j] > hist[r] &&
+                                 hist[j] >= mag_thr;
+            if (!is_peak)
+                continue;
+            if (!(*counter < max_feat))
+                continue;
+
+            // Refine the peak position from its two neighbours
+            float bin = j + 0.5f * (hist[l] - hist[r]) /
+                (hist[l] - 2.0f*hist[j] + hist[r]);
+            if (bin < 0.0f)
+                bin = bin + n;
+            else if (bin >= n)
+                bin = bin - n;
+
+            const float ori = 360.f - ((360.f/n) * bin);
+
+            const unsigned slot = *counter;
+            x_out[slot]        = kp_x * out_scale;
+            y_out[slot]        = kp_y * out_scale;
+            layer_out[slot]    = kp_layer;
+            response_out[slot] = kp_resp;
+            size_out[slot]     = kp_size * out_scale;
+            ori_out[slot]      = ori;
+            *counter = slot + 1;
+        }
+    }
+}
+
+// Scales desc[0..histlen) in place to unit Euclidean length.
+//
+// A descriptor whose squared length is not positive (all zeros, or containing
+// NaN) is left untouched: dividing by a zero norm would turn every entry
+// into NaN (0 * inf).
+void normalizeDesc(
+    float* desc,
+    const int histlen)
+{
+    float len_sq = 0.0f;
+
+    for (int i = 0; i < histlen; i++)
+        len_sq += desc[i] * desc[i];
+
+    if (!(len_sq > 0.0f))
+        return;
+
+    float len_inv = 1.0f / sqrt(len_sq);
+
+    for (int i = 0; i < histlen; i++) {
+        desc[i] *= len_inv;
+    }
+}
+
+// Builds the SIFT descriptor of every keypoint (Section 6 of Lowe's paper).
+//
+// For each keypoint a window of the matching Gaussian pyramid layer is
+// rotated by the keypoint orientation and its gradients are distributed by
+// trilinear interpolation into a d x d grid of n-bin orientation histograms.
+// The histogram vector is normalised, clipped at DescrMagThr, normalised
+// again, scaled by IntDescrFctr, capped at 255 and rounded.
+//
+// desc_out receives desc_len values per keypoint, stored contiguously.
+// The scratch buffer holds 128 floats, so desc_len must not exceed 128.
+template<typename T>
+void computeDescriptor(
+    float* desc_out,
+    const unsigned desc_len,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const float* ori_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const int d,
+    const int n,
+    const float scale,
+    const unsigned octave,
+    const unsigned n_layers)
+{
+    float desc[128];
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        const unsigned kp_layer = layer_in[f];
+
+        // Orientation in radians, folded into (-pi, pi]
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        if (ori > PI_VAL)
+            ori = ori - PI_VAL*2;
+
+        const float kp_size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Gaussian pyramid layer the keypoint belongs to
+        Array<T> img = gauss_pyr[octave*(n_layers+3) + kp_layer];
+        const T* img_ptr = img.get();
+        af::dim4 idims = img.dims();
+
+        const float cos_t = cos(ori);
+        const float sin_t = sin(ori);
+        const float bins_per_rad = n / (PI_VAL * 2.f);
+        const float exp_denom = d * d * 0.5f;
+        const float hist_width = DescrSclFctr * kp_size * scale * 0.5f;
+        const int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+        const int len = radius*2+1;
+
+        for (int k = 0; k < (int)desc_len; k++)
+            desc[k] = 0.f;
+
+        // Accumulate the gradient histograms over the window
+        for (int cell = 0; cell < len*len; cell++) {
+            const int i = cell / len - radius;
+            const int j = cell % len - radius;
+
+            const int y = fy + i;
+            const int x = fx + j;
+
+            // Window offset expressed in the rotated, histogram-sized frame
+            const float x_rot = (j * cos_t - i * sin_t) / hist_width;
+            const float y_rot = (j * sin_t + i * cos_t) / hist_width;
+            float xbin = x_rot + d/2 - 0.5f;
+            float ybin = y_rot + d/2 - 0.5f;
+
+            const bool in_grid  = ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d;
+            const bool in_image = y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1;
+            if (!(in_grid && in_image))
+                continue;
+
+            const float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+            const float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+            const float grad_mag = sqrt(dx*dx + dy*dy);
+            float grad_ori = atan2(dy, dx) - ori;
+            while (grad_ori < 0.0f)
+                grad_ori += PI_VAL*2;
+            while (grad_ori >= PI_VAL*2)
+                grad_ori -= PI_VAL*2;
+
+            const float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
+            float obin = grad_ori * bins_per_rad;
+            const float mag = grad_mag*w;
+
+            // Split each bin coordinate into integer cell and fraction
+            const int x0 = floor(xbin);
+            const int y0 = floor(ybin);
+            const int o0 = floor(obin);
+            xbin -= x0;
+            ybin -= y0;
+            obin -= o0;
+
+            // Trilinear distribution over the 2x2x2 neighbouring bins
+            for (int yl = 0; yl <= 1; yl++) {
+                const int yb = y0 + yl;
+                if (yb < 0 || yb >= d)
+                    continue;
+                const float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
+
+                for (int xl = 0; xl <= 1; xl++) {
+                    const int xb = x0 + xl;
+                    if (xb < 0 || xb >= d)
+                        continue;
+                    const float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
+
+                    for (int ol = 0; ol <= 1; ol++) {
+                        const int ob = (o0 + ol) % n;
+                        const float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
+                        desc[(yb*d + xb)*n + ob] += v_o;
+                    }
+                }
+            }
+        }
+
+        normalizeDesc(desc, desc_len);
+
+        // Clip large components, then renormalise
+        for (int k = 0; k < (int)desc_len; k++)
+            desc[k] = min(desc[k], DescrMagThr);
+
+        normalizeDesc(desc, desc_len);
+
+        // Final descriptor values
+        float* dst = desc_out + f*desc_len;
+        for (int k = 0; k < (int)desc_len; k++)
+            dst[k] = round(min(255.f, desc[k] * IntDescrFctr));
+    }
+}
+
+// Builds the GLOH descriptor of every keypoint (Section III-B of the
+// Mikolajczyk and Schmid paper).
+//
+// Gradients in a window around the keypoint are binned on a log-polar grid:
+// one central bin plus, for each outer ring, `ab` angular bins, each bin
+// holding an `hb`-bin orientation histogram. The vector is then normalised,
+// clipped at DescrMagThr, normalised again, scaled by IntDescrFctr, capped
+// at 255 and rounded.
+//
+// desc_out receives desc_len values per keypoint, stored contiguously.
+// The scratch buffer holds 272 floats, so desc_len must not exceed 272.
+template<typename T>
+void computeGLOHDescriptor(
+    float* desc_out,
+    const unsigned desc_len,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const float* ori_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const int d,
+    const unsigned rb,
+    const unsigned ab,
+    const unsigned hb,
+    const float scale,
+    const unsigned octave,
+    const unsigned n_layers)
+{
+    float desc[272];
+
+    // Radius of the outermost ring in GLOH units
+    const float outer_radius = GLOHRadii[rb-1];
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        const unsigned kp_layer = layer_in[f];
+
+        // Orientation in radians, folded into (-pi, pi]
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        if (ori > PI_VAL)
+            ori = ori - PI_VAL*2;
+
+        const float kp_size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Gaussian pyramid layer the keypoint belongs to
+        Array<T> img = gauss_pyr[octave*(n_layers+3) + kp_layer];
+        const T* img_ptr = img.get();
+        af::dim4 idims = img.dims();
+
+        const float cos_t = cos(ori);
+        const float sin_t = sin(ori);
+        const float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+        const float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+        const float exp_denom = outer_radius * 0.5f;
+
+        const float hist_width = DescrSclFctr * kp_size * scale * 0.5f;
+
+        // Same window radius as the SIFT descriptor. An alternative is
+        // hist_width * GLOHRadii[rb-1] * rw + 0.5f with a radius weight rw in
+        // 0.25f-0.75f; larger rw tends to improve recall at the cost of fewer
+        // correct matches.
+        const int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+        const int len = radius*2+1;
+
+        for (int k = 0; k < (int)desc_len; k++)
+            desc[k] = 0.f;
+
+        // Accumulate the gradient histograms over the window
+        for (int cell = 0; cell < len*len; cell++) {
+            const int i = cell / len - radius;
+            const int j = cell % len - radius;
+
+            const int y = fy + i;
+            const int x = fx + j;
+
+            const float x_rot = (j * cos_t - i * sin_t);
+            const float y_rot = (j * sin_t + i * cos_t);
+
+            // Polar coordinates of the sample, radius rescaled to GLOH units
+            const float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * outer_radius;
+            float theta = atan2(y_rot, x_rot);
+            while (theta < 0.0f)
+                theta += PI_VAL*2;
+            while (theta >= PI_VAL*2)
+                theta -= PI_VAL*2;
+
+            float tbin = theta * polar_bins_per_rad;
+
+            // Piecewise-linear map of r onto the fractional ring index
+            float rbin;
+            if (r < GLOHRadii[0])
+                rbin = r / GLOHRadii[0];
+            else if (r < GLOHRadii[1])
+                rbin = 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]);
+            else
+                rbin = min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON);
+
+            const bool in_image = y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1;
+            if (!(r <= outer_radius && in_image))
+                continue;
+
+            const float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+            const float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+            const float grad_mag = sqrt(dx*dx + dy*dy);
+            float grad_ori = atan2(dy, dx) - ori;
+            while (grad_ori < 0.0f)
+                grad_ori += PI_VAL*2;
+            while (grad_ori >= PI_VAL*2)
+                grad_ori -= PI_VAL*2;
+
+            const float w = exp(-r / exp_denom);
+            float obin = grad_ori * hist_bins_per_rad;
+            const float mag = grad_mag*w;
+
+            // Split each bin coordinate into integer cell and fraction
+            const int t0 = floor(tbin);
+            const int r0 = floor(rbin);
+            const int o0 = floor(obin);
+            tbin -= t0;
+            rbin -= r0;
+            obin -= o0;
+
+            // Distribute the sample over neighbouring ring/angle/orientation bins
+            for (int rl = 0; rl <= 1; rl++) {
+                const int ring = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+                const float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+                if (ring < 0 || ring > 2)
+                    continue;
+
+                for (int tl = 0; tl <= 1; tl++) {
+                    const int tb = (t0 + tl) % ab;
+                    const float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+
+                    for (int ol = 0; ol <= 1; ol++) {
+                        const int ob = (o0 + ol) % hb;
+                        const float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+                        // Ring 0 is a single histogram; outer rings hold ab each
+                        const unsigned idx = (ring > 0) * (hb + ((ring-1) * ab + tb)*hb) + ob;
+                        desc[idx] += v_o;
+                    }
+                }
+            }
+        }
+
+        normalizeDesc(desc, desc_len);
+
+        // Clip large components, then renormalise
+        for (int k = 0; k < (int)desc_len; k++)
+            desc[k] = min(desc[k], DescrMagThr);
+
+        normalizeDesc(desc, desc_len);
+
+        // Final descriptor values
+        float* dst = desc_out + f*desc_len;
+        for (int k = 0; k < (int)desc_len; k++)
+            dst[k] = round(min(255.f, desc[k] * IntDescrFctr));
+    }
+}
+
+#undef IPTR
+
+// Produces the base image of the scale space: `img` (optionally upsampled to
+// twice its size with bilinear interpolation) blurred by a separable Gaussian
+// so that, given the blur already assumed present in the input (InitSigma, or
+// 2*InitSigma after doubling), the result has a total blur of init_sigma.
+//
+// The incremental sigma is sqrt(init_sigma^2 - assumed^2), never below 0.1.
+// The difference is clamped at zero before the square root: for an
+// init_sigma smaller than the assumed blur the square root would be NaN, and
+// std::max(NaN, 0.1f) returns NaN rather than the intended floor.
+template<typename T, typename convAccT>
+Array<T> createInitialImage(
+    const Array<T>& img,
+    const float init_sigma,
+    const bool double_input)
+{
+    af::dim4 idims = img.dims();
+
+    const float assumed_sq = (double_input) ? InitSigma * InitSigma * 4
+                                            : InitSigma * InitSigma;
+    const float diff_sq = std::max(init_sigma * init_sigma - assumed_sq, 0.f);
+    const float s = std::max((float)sqrt(diff_sq), 0.1f);
+
+    Array<T> filter = gauss_filter<T>(s);
+
+    if (double_input) {
+        Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
+        return convolve2<T, convAccT, false>(double_img, filter, filter);
+    }
+
+    return convolve2<T, convAccT, false>(img, filter, filter);
+}
+
+template<typename T, typename convAccT>
+std::vector< Array<T> > buildGaussPyr(
+    const Array<T>& init_img,
+    const unsigned n_octaves,
+    const unsigned n_layers,
+    const float init_sigma)
+{
+    // sigmas[l] is the incremental blur taking layer l-1 to layer l, derived from
+    // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2 (n_layers+3 images per octave)
+    const unsigned per_octave = n_layers + 3;
+    const float step = std::pow(2.0f, 1.0f / n_layers);
+
+    std::vector<float> sigmas(per_octave);
+    sigmas[0] = init_sigma;
+    for (unsigned l = 1; l < per_octave; l++) {
+        const float before = std::pow(step, l-1) * init_sigma;
+        const float after  = before * step;
+        sigmas[l] = std::sqrt(after*after - before*before);
+    }
+
+    std::vector< Array<T> > pyramid(n_octaves * per_octave, createEmptyArray<T>(af::dim4()));
+    for (unsigned o = 0; o < n_octaves; o++) {
+        const unsigned first = o * per_octave;
+        if (o == 0) {
+            // Octave 0 starts from the caller-supplied image.
+            pyramid[first] = init_img;
+        }
+        else {
+            // Later octaves start from layer n_layers of the previous octave,
+            // resized to half its extent along the first two dimensions.
+            Array<T>& seed = pyramid[first - per_octave + n_layers];
+            af::dim4 sdims = seed.dims();
+            pyramid[first] = resize<T>(seed, sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
+        }
+        // Each remaining layer is the previous layer convolved with its sigma.
+        for (unsigned l = 1; l < per_octave; l++) {
+            Array<T> filter = gauss_filter<T>(sigmas[l]);
+            pyramid[first + l] = convolve2<T, convAccT, false>(pyramid[first + l - 1], filter, filter);
+        }
+    }
+    return pyramid;
+}
+
+template<typename T>
+std::vector< Array<T> > buildDoGPyr(
+    std::vector< Array<T> >& gauss_pyr,
+    const unsigned n_octaves,
+    const unsigned n_layers)
+{
+    // An octave of n_layers+3 Gaussian images yields n_layers+2 DoG images, each
+    // produced by sub<T>() from two consecutive Gaussian layers of that octave.
+    const unsigned dog_per_octave   = n_layers + 2;
+    const unsigned gauss_per_octave = n_layers + 3;
+    std::vector< Array<T> > dog_pyr(n_octaves * dog_per_octave, createEmptyArray<T>(af::dim4()));
+    for (unsigned o = 0; o < n_octaves; o++) {
+        for (unsigned l = 0; l < dog_per_octave; l++) {
+            Array<T>& lower = gauss_pyr[o * gauss_per_octave + l];
+            Array<T>& upper = gauss_pyr[o * gauss_per_octave + l + 1];
+            Array<T>& diff  = dog_pyr[o * dog_per_octave + l];
+            diff = createEmptyArray<T>(lower.dims());
+            sub<T>(diff, upper, lower);
+        }
+    }
+    return dog_pyr;
+}
+
+
+template<typename T, typename convAccT>
+unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
+                   Array<float>& ori, Array<float>& size, Array<float>& desc,
+                   const Array<T>& in, const unsigned n_layers,
+                   const float contrast_thr, const float edge_thr,
+                   const float init_sigma, const bool double_input,
+                   const float img_scale, const float feature_ratio,
+                   const bool compute_GLOH)
+{
+    // Runs the SIFT stages octave by octave and gathers the per-octave results into
+    // the output arrays. Returns the feature count; outputs are untouched when it is 0.
+    in.eval();
+    getQueue().sync();
+    af::dim4 idims = in.dims();
+
+    const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
+        : min(idims[0], idims[1]);
+    const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+
+    Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
+
+    std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+
+    std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
+
+    std::vector<float*> x_pyr(n_octaves, NULL);
+    std::vector<float*> y_pyr(n_octaves, NULL);
+    std::vector<float*> response_pyr(n_octaves, NULL);
+    std::vector<float*> size_pyr(n_octaves, NULL);
+    std::vector<float*> ori_pyr(n_octaves, NULL);
+    std::vector<float*> desc_pyr(n_octaves, NULL);
+    std::vector<unsigned> feat_pyr(n_octaves, 0);
+    unsigned total_feat = 0;
+
+    const unsigned d = DescrWidth;
+    const unsigned n = DescrHistBins;
+    const unsigned rb = GLOHRadialBins;
+    const unsigned ab = GLOHAngularBins;
+    const unsigned hb = GLOHHistBins;
+    const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
+
+    for (unsigned i = 0; i < n_octaves; i++) {
+        af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
+        if (ddims[0]-2*ImgBorder < 1 || ddims[1]-2*ImgBorder < 1)
+            continue;
+
+        const unsigned imel = ddims[0] * ddims[1];
+        const unsigned max_feat = ceil(imel * feature_ratio);
+
+        float* extrema_x = memAlloc<float>(max_feat);
+        float* extrema_y = memAlloc<float>(max_feat);
+        unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
+        unsigned extrema_feat = 0;
+
+        for (unsigned j = 1; j <= n_layers; j++) {
+            unsigned prev   = i*(n_layers+2) + j-1;
+            unsigned center = i*(n_layers+2) + j;
+            unsigned next   = i*(n_layers+2) + j+1;
+            unsigned layer = j;
+            float extrema_thr = 0.5f * contrast_thr / n_layers;
+            detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
+                             dog_pyr[prev], dog_pyr[center], dog_pyr[next],
+                             layer, max_feat, extrema_thr);
+        }
+
+        extrema_feat = min(extrema_feat, max_feat);
+
+        if (extrema_feat == 0) {
+            memFree(extrema_x);
+            memFree(extrema_y);
+            memFree(extrema_layer);
+            continue;
+        }
+
+        unsigned interp_feat = 0;
+
+        float* interp_x = memAlloc<float>(extrema_feat);
+        float* interp_y = memAlloc<float>(extrema_feat);
+        unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
+        float* interp_response = memAlloc<float>(extrema_feat);
+        float* interp_size = memAlloc<float>(extrema_feat);
+
+        interpolateExtrema<T>(interp_x, interp_y, interp_layer,
+                              interp_response, interp_size, &interp_feat,
+                              extrema_x, extrema_y, extrema_layer, extrema_feat,
+                              dog_pyr, max_feat, i, n_layers,
+                              contrast_thr, edge_thr, init_sigma, img_scale);
+        // The raw extrema are not read again below; release them before any early exit.
+        memFree(extrema_x);
+        memFree(extrema_y);
+        memFree(extrema_layer);
+
+        interp_feat = min(interp_feat, max_feat);
+
+        if (interp_feat == 0) {
+            memFree(interp_x);
+            memFree(interp_y);
+            memFree(interp_layer);
+            memFree(interp_response);
+            memFree(interp_size);
+            continue;
+        }
+
+        std::vector<feat_t> sorted_feat;
+        array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
+        std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+
+        memFree(interp_x);
+        memFree(interp_y);
+        memFree(interp_layer);
+        memFree(interp_response);
+        memFree(interp_size);
+
+        unsigned nodup_feat = 0;
+
+        float* nodup_x = memAlloc<float>(interp_feat);
+        float* nodup_y = memAlloc<float>(interp_feat);
+        unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
+        float* nodup_response = memAlloc<float>(interp_feat);
+        float* nodup_size = memAlloc<float>(interp_feat);
+
+        removeDuplicates(nodup_x, nodup_y, nodup_layer,
+                         nodup_response, nodup_size, &nodup_feat,
+                         sorted_feat);
+
+        const unsigned max_oriented_feat = nodup_feat * 3;
+
+        float* oriented_x = memAlloc<float>(max_oriented_feat);
+        float* oriented_y = memAlloc<float>(max_oriented_feat);
+        unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
+        float* oriented_response = memAlloc<float>(max_oriented_feat);
+        float* oriented_size = memAlloc<float>(max_oriented_feat);
+        float* oriented_ori = memAlloc<float>(max_oriented_feat);
+
+        unsigned oriented_feat = 0;
+
+        calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
+                           oriented_response, oriented_size, oriented_ori, &oriented_feat,
+                           nodup_x, nodup_y, nodup_layer,
+                           nodup_response, nodup_size, nodup_feat,
+                           gauss_pyr, max_oriented_feat, i, n_layers, double_input);
+
+        memFree(nodup_x);
+        memFree(nodup_y);
+        memFree(nodup_layer);
+        memFree(nodup_response);
+        memFree(nodup_size);
+
+        if (oriented_feat == 0) {
+            memFree(oriented_x);
+            memFree(oriented_y);
+            memFree(oriented_layer);
+            memFree(oriented_response);
+            memFree(oriented_size);
+            memFree(oriented_ori);
+            continue;
+        }
+
+        float* oct_desc = memAlloc<float>(oriented_feat * desc_len);
+
+        float scale = 1.f/(1 << i);
+        if (double_input) scale *= 2.f;
+
+        if (compute_GLOH)
+            computeGLOHDescriptor<T>(oct_desc, desc_len,
+                                     oriented_x, oriented_y, oriented_layer,
+                                     oriented_response, oriented_size, oriented_ori,
+                                     oriented_feat, gauss_pyr, d, rb, ab, hb,
+                                     scale, i, n_layers);
+        else
+            computeDescriptor<T>(oct_desc, desc_len,
+                                 oriented_x, oriented_y, oriented_layer,
+                                 oriented_response, oriented_size, oriented_ori,
+                                 oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
+        // Layer indices are only consumed by the descriptor stage and are not exported.
+        memFree(oriented_layer);
+
+        total_feat += oriented_feat;
+        feat_pyr[i] = oriented_feat;
+        // Ownership of the remaining buffers moves to the gather loop below (oriented_feat > 0 here).
+        x_pyr[i] = oriented_x;
+        y_pyr[i] = oriented_y;
+        response_pyr[i] = oriented_response;
+        ori_pyr[i] = oriented_ori;
+        size_pyr[i] = oriented_size;
+        desc_pyr[i] = oct_desc;
+    }
+
+    if (total_feat > 0) {
+        const af::dim4 total_feat_dims(total_feat);
+        const af::dim4 desc_dims(desc_len, total_feat);
+
+        // Allocate output memory
+        x     = createEmptyArray<float>(total_feat_dims);
+        y     = createEmptyArray<float>(total_feat_dims);
+        score = createEmptyArray<float>(total_feat_dims);
+        ori   = createEmptyArray<float>(total_feat_dims);
+        size  = createEmptyArray<float>(total_feat_dims);
+        desc  = createEmptyArray<float>(desc_dims);
+
+        float* x_ptr = x.get();
+        float* y_ptr = y.get();
+        float* score_ptr = score.get();
+        float* ori_ptr = ori.get();
+        float* size_ptr = size.get();
+        float* desc_ptr = desc.get();
+
+        unsigned offset = 0;
+        for (unsigned i = 0; i < n_octaves; i++) {
+            if (feat_pyr[i] == 0)
+                continue;
+
+            memcpy(x_ptr+offset,     x_pyr[i],        feat_pyr[i] * sizeof(float));
+            memcpy(y_ptr+offset,     y_pyr[i],        feat_pyr[i] * sizeof(float));
+            memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
+            memcpy(ori_ptr+offset,   ori_pyr[i],      feat_pyr[i] * sizeof(float));
+            memcpy(size_ptr+offset,  size_pyr[i],     feat_pyr[i] * sizeof(float));
+
+            memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
+
+            memFree(x_pyr[i]);
+            memFree(y_pyr[i]);
+            memFree(response_pyr[i]);
+            memFree(ori_pyr[i]);
+            memFree(size_pyr[i]);
+            memFree(desc_pyr[i]);
+
+            offset += feat_pyr[i];
+        }
+    }
+
+    return total_feat;
+}
+
+}
diff --git a/src/backend/cpu/kernel/sobel.hpp b/src/backend/cpu/kernel/sobel.hpp
new file mode 100644
index 0000000000..49d33cdbb4
--- /dev/null
+++ b/src/backend/cpu/kernel/sobel.hpp
@@ -0,0 +1,86 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <cassert>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename Ti, typename To, bool isDX>
+void derivative(Array<To> output, const Array<Ti> input)
+{
+    // 3x3 derivative with weights (1,2,1), reading zero outside the image. With
+    // isDX the difference is taken along dim 1 (index j), otherwise along dim 0.
+    const af::dim4 dims     = input.dims();
+    const af::dim4 istrides = input.strides();
+    const af::dim4 ostrides = output.strides();
+
+    for(dim_t b3=0; b3<dims[3]; ++b3) {
+        // Slice pointers are rebuilt from the array base for every b3: stepping by
+        // strides[3] on top of the accumulated strides[2] steps would skip a batch.
+        // Input and output are addressed with their own strides.
+        To* optr       = output.get() + b3*ostrides[3];
+        const Ti* iptr = input.get()  + b3*istrides[3];
+
+        for(dim_t b2=0; b2<dims[2]; ++b2) {
+            for(dim_t j=0; j<dims[1]; ++j) {
+                int joff  = j;
+                int _joff = j-1;
+                int joff_ = j+1;
+
+                for(dim_t i=0; i<dims[0]; ++i) {
+                    To accum = To(0);
+                    int  ioff = i;
+                    int _ioff = i-1;
+                    int ioff_ = i+1;
+
+                    To NW = (_ioff>=0 && _joff>=0) ?
+                            iptr[_joff*istrides[1]+_ioff*istrides[0]] : 0;
+                    To SW = (ioff_<(int)dims[0] && _joff>=0) ?
+                            iptr[_joff*istrides[1]+ioff_*istrides[0]] : 0;
+                    To NE = (_ioff>=0 && joff_<(int)dims[1]) ?
+                            iptr[joff_*istrides[1]+_ioff*istrides[0]] : 0;
+                    To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ?
+                            iptr[joff_*istrides[1]+ioff_*istrides[0]] : 0;
+
+                    if (isDX) {
+                        To W  = _joff>=0 ?
+                                iptr[_joff*istrides[1]+ioff*istrides[0]] : 0;
+
+                        To E  = joff_<(int)dims[1] ?
+                                iptr[joff_*istrides[1]+ioff*istrides[0]] : 0;
+
+                        accum = NW+SW - (NE+SE) + 2*(W-E);
+                    } else {
+                        To N  = _ioff>=0 ?
+                                iptr[joff*istrides[1]+_ioff*istrides[0]] : 0;
+
+                        To S  = ioff_<(int)dims[0] ?
+                                iptr[joff*istrides[1]+ioff_*istrides[0]] : 0;
+
+                        accum = NW+NE - (SW+SE) + 2*(N-S);
+                    }
+
+                    optr[j*ostrides[1]+i*ostrides[0]] = accum;
+                }
+            }
+
+            optr += ostrides[2];
+            iptr += istrides[2];
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp
new file mode 100644
index 0000000000..292c6383dc
--- /dev/null
+++ b/src/backend/cpu/kernel/sort.hpp
@@ -0,0 +1,52 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <err_cpu.hpp>
+#include <functional>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Based off of http://stackoverflow.com/a/12399290
+template<typename T, bool isAscending>
+void sort0(Array<T> val)
+{
+    // Sorts every dim-0 column of `val` in place. The comparator type is picked
+    // from isAscending directly, so std::sort is not routed through a
+    // type-erased function object.
+    T *val_ptr = val.get();
+    const af::dim4 dims    = val.dims();
+    const af::dim4 strides = val.strides();
+
+    for(dim_t w = 0; w < dims[3]; w++) {
+        dim_t valW = w * strides[3];
+        for(dim_t z = 0; z < dims[2]; z++) {
+            dim_t valWZ = valW + z * strides[2];
+            for(dim_t y = 0; y < dims[1]; y++) {
+                T *col = val_ptr + valWZ + y * strides[1];
+                if(isAscending) {
+                    std::sort(col, col + dims[0], std::less<T>());
+                } else {
+                    std::sort(col, col + dims[0], std::greater<T>());
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp
new file mode 100644
index 0000000000..f9d391dc46
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key.hpp
@@ -0,0 +1,86 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <queue>
+#include <err_cpu.hpp>
+#include <functional>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename Tk, typename Tv, bool isAscending>
+void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
+                  const Array<Tk> ikey, const Array<Tv> ival)
+{
+    // For every dim-0 column: stable-sorts the keys of ikey, stores the resulting
+    // permutation in oidx and the permuted keys/values in okey/oval.
+    std::function<bool(Tk, Tk)> op = std::greater<Tk>();
+    if(isAscending) { op = std::less<Tk>(); }
+
+    // Get pointers and initialize original index locations
+        uint *oidx_ptr = oidx.get();
+          Tk *okey_ptr = okey.get();
+          Tv *oval_ptr = oval.get();
+    const Tk *ikey_ptr = ikey.get();
+    const Tv *ival_ptr = ival.get();
+
+    std::vector<uint> seq_vec(oidx.dims()[0]);
+    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+
+    const Tk *comp_ptr = nullptr;
+    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+    for(dim_t w = 0; w < ikey.dims()[3]; w++) {
+        dim_t okeyW = w * okey.strides()[3];
+        dim_t ovalW = w * oval.strides()[3];
+        dim_t oidxW = w * oidx.strides()[3];
+        dim_t ikeyW = w * ikey.strides()[3];
+        dim_t ivalW = w * ival.strides()[3];
+
+        for(dim_t z = 0; z < ikey.dims()[2]; z++) {
+            dim_t okeyWZ = okeyW + z * okey.strides()[2];
+            dim_t ovalWZ = ovalW + z * oval.strides()[2];
+            dim_t oidxWZ = oidxW + z * oidx.strides()[2];
+            dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
+            dim_t ivalWZ = ivalW + z * ival.strides()[2];
+
+            for(dim_t y = 0; y < ikey.dims()[1]; y++) {
+
+                dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
+                dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
+                dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
+                dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
+                dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
+                // Start this column from the identity permutation 0..n-1.
+                uint *ptr = oidx_ptr + oidxOffset;
+                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                // Stable-sort the indices, comparing through this column's input keys.
+                comp_ptr = ikey_ptr + ikeyOffset;
+                std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
+                // Gather keys and values through the sorted permutation.
+                for (dim_t i = 0; i < oval.dims()[0]; ++i){
+                    uint sortIdx = oidx_ptr[oidxOffset + i];
+                    okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
+                    oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp
new file mode 100644
index 0000000000..b71cc47071
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <err_cpu.hpp>
+#include <functional>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool isAscending>
+void sort0_index(Array<T> val, Array<uint> idx, const Array<T> in)
+{
+    // For every dim-0 column of `in`: stable-sorts the values, writing the sorted
+    // values to `val` and their original row indices to `idx`.
+    // initialize original index locations
+       uint *idx_ptr = idx.get();
+          T *val_ptr = val.get();
+    const T *in_ptr  = in.get();
+    std::function<bool(T, T)> op = std::greater<T>();
+    if(isAscending) { op = std::less<T>(); }
+
+    std::vector<uint> seq_vec(idx.dims()[0]);
+    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+
+    const T *comp_ptr = nullptr;
+    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+    for(dim_t w = 0; w < in.dims()[3]; w++) {
+        dim_t valW = w * val.strides()[3];
+        dim_t idxW = w * idx.strides()[3];
+        dim_t  inW = w *  in.strides()[3];
+        for(dim_t z = 0; z < in.dims()[2]; z++) {
+            dim_t valWZ = valW + z * val.strides()[2];
+            dim_t idxWZ = idxW + z * idx.strides()[2];
+            dim_t  inWZ =  inW + z *  in.strides()[2];
+            for(dim_t y = 0; y < in.dims()[1]; y++) {
+
+                dim_t valOffset = valWZ + y * val.strides()[1];
+                dim_t idxOffset = idxWZ + y * idx.strides()[1];
+                dim_t inOffset  =  inWZ + y *  in.strides()[1];
+                // Start this column from the identity permutation 0..n-1.
+                uint *ptr = idx_ptr + idxOffset;
+                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                // Stable-sort the indices, comparing through this column's input values.
+                comp_ptr = in_ptr + inOffset;
+                std::stable_sort(ptr, ptr + in.dims()[0], comparator);
+                // Gather the values through the sorted permutation.
+                for (dim_t i = 0; i < val.dims()[0]; ++i){
+                    val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/susan.hpp b/src/backend/cpu/kernel/susan.hpp
new file mode 100644
index 0000000000..f543967799
--- /dev/null
+++ b/src/backend/cpu/kernel/susan.hpp
@@ -0,0 +1,99 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void susan_responses(Array<T> output, const Array<T> input,
+                     const unsigned idim0, const unsigned idim1,
+                     const int radius, const float t, const float g,
+                     const unsigned border_len)
+{
+    // SUSAN response per pixel inside the border: nM sums exp(-((m - m_0)/t)^6) over
+    // offsets with i*i + j*j < radius^2; the response is g - nM if nM < g, else 0.
+    T* resp_out = output.get();
+    const T* in = input.get();
+    const unsigned r = border_len;
+    const int rSqrd = radius*radius;
+
+    for (unsigned y = r; y < idim1 - r; ++y) {
+        for (unsigned x = r; x < idim0 - r; ++x) {
+            const unsigned idx = y * idim0 + x;
+            T m_0 = in[idx];
+            float nM = 0.0f;
+            for (int i=-radius; i<=radius; ++i) {
+                for (int j=-radius; j<=radius; ++j) {
+                    if (i*i + j*j < rSqrd) {
+                        int p = x + i;
+                        int q = y + j;
+                        T m = in[p + idim0 * q];
+                        // Absolute difference: m - m_0 would wrap around for unsigned T
+                        // when m < m_0. The even power makes the sign irrelevant.
+                        const auto diff = (m > m_0) ? (m - m_0) : (m_0 - m);
+                        float exp_pow = std::pow(diff/t, 6.0);
+                        nM += std::exp(-exp_pow);
+                    }
+                }
+            }
+            resp_out[idx] = nM < g ? g - nM : T(0);
+        }
+    }
+}
+
+template<typename T>
+void non_maximal(Array<float> xcoords, Array<float> ycoords, Array<float> response,
+                 shared_ptr<unsigned> counter, const unsigned idim0, const unsigned idim1,
+                 const Array<T> input, const unsigned border_len, const unsigned max_corners)
+{
+    // Scans the response map `input` (indexed as y*idim0 + x) and records every
+    // strict 8-neighbourhood maximum. *counter is incremented for each maximum,
+    // but only those whose running index is below max_corners are stored.
+    const T* resp   = input.get();
+    float* out_x    = xcoords.get();
+    float* out_y    = ycoords.get();
+    float* out_resp = response.get();
+    unsigned* found = counter.get();
+
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned margin = border_len + 1;
+
+    for (unsigned y = margin; y < idim1 - margin; y++) {
+        const T* up   = resp + (y-1) * idim0;
+        const T* mid  = resp + y * idim0;
+        const T* down = resp + (y+1) * idim0;
+        for (unsigned x = margin; x < idim0 - margin; x++) {
+            // Largest of the 8 neighbours, visited column by column
+            T neigh_max = max(up[x-1], mid[x-1]);
+            neigh_max = max(neigh_max, down[x-1]);
+            neigh_max = max(neigh_max, up[x]);
+            neigh_max = max(neigh_max, down[x]);
+            neigh_max = max(neigh_max, up[x+1]);
+            neigh_max = max(neigh_max, mid[x+1]);
+            neigh_max = max(neigh_max, down[x+1]);
+
+            // Keep the pixel only if it is strictly greater than all neighbours
+            const T v = mid[x];
+            if (!(v > neigh_max)) continue;
+            const unsigned slot = (*found)++;
+            if (slot >= max_corners) continue;
+            out_x[slot]    = (float)x;
+            out_y[slot]    = (float)y;
+            out_resp[slot] = (float)v;
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/tile.hpp b/src/backend/cpu/kernel/tile.hpp
new file mode 100644
index 0000000000..3ad3009041
--- /dev/null
+++ b/src/backend/cpu/kernel/tile.hpp
@@ -0,0 +1,55 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void tile(Array<T> out, const Array<T> in)
+{
+    // Fills `out` with periodic copies of `in`: along each dimension, output
+    // coordinate o reads input coordinate o % (input extent).
+    // The strides of both arrays are applied for dimensions 1 to 3.
+    const af::dim4 inDims     = in.dims();
+    const af::dim4 outDims    = out.dims();
+    const af::dim4 inStrides  = in.strides();
+    const af::dim4 outStrides = out.strides();
+
+    T* dst = out.get();
+    const T* src = in.get();
+
+    for(dim_t w = 0; w < outDims[3]; w++) {
+        T* dstW       = dst + w * outStrides[3];
+        const T* srcW = src + (w % inDims[3]) * inStrides[3];
+
+        for(dim_t z = 0; z < outDims[2]; z++) {
+            T* dstZ       = dstW + z * outStrides[2];
+            const T* srcZ = srcW + (z % inDims[2]) * inStrides[2];
+
+            for(dim_t y = 0; y < outDims[1]; y++) {
+                T* dstY       = dstZ + y * outStrides[1];
+                const T* srcY = srcZ + (y % inDims[1]) * inStrides[1];
+
+                // Dimension 0 is indexed directly, i.e. with unit stride.
+                for(dim_t x = 0; x < outDims[0]; x++) {
+                    dstY[x] = srcY[x % inDims[0]];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/transform.hpp b/src/backend/cpu/kernel/transform.hpp
new file mode 100644
index 0000000000..2311e4efaa
--- /dev/null
+++ b/src/backend/cpu/kernel/transform.hpp
@@ -0,0 +1,131 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+// Computes into txo the inverted form of transform txi (9 coefficients if perspective, else 6).
+template <typename T>
+void calc_transform_inverse(T *txo, const T *txi, const bool perspective)
+{
+    if (perspective) {  // 3x3 case: txo first receives the adjugate of txi
+        txo[0] =   txi[4]*txi[8] - txi[5]*txi[7];
+        txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]);
+        txo[2] =   txi[1]*txi[5] - txi[2]*txi[4];
+
+        txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]);
+        txo[4] =   txi[0]*txi[8] - txi[2]*txi[6];
+        txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]);
+
+        txo[6] =   txi[3]*txi[7] - txi[4]*txi[6];
+        txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]);
+        txo[8] =   txi[0]*txi[4] - txi[1]*txi[3];
+        // Determinant by expansion along txi[0..2], reusing the cofactors stored above
+        T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6];
+        // Dividing the adjugate by the determinant gives the inverse (det == 0 is not checked)
+        txo[0] /= det; txo[1] /= det; txo[2] /= det;
+        txo[3] /= det; txo[4] /= det; txo[5] /= det;
+        txo[6] /= det; txo[7] /= det; txo[8] /= det;
+    }
+    else {  // 6-coefficient case: 2x2 part in txi[0,1,3,4], offsets in txi[2] and txi[5]
+        T det = txi[0]*txi[4] - txi[1]*txi[3];
+        // NOTE(review): off-diagonals are swapped, not negated; true inverse only if txi[1] == -txi[3] - confirm
+        txo[0] = txi[4] / det;
+        txo[1] = txi[3] / det;
+        txo[3] = txi[1] / det;
+        txo[4] = txi[0] / det;
+        // Offset terms: negated product of the 2x2 part of txo with (txi[2], txi[5])
+        txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
+        txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
+    }
+}
+
+template <typename T>
+void calc_transform_inverse(T *tmat, const T *tmat_ptr, const bool inverse,
+                            const bool perspective, const unsigned transf_len)
+{
+    // The kernel consumes an inverse transform. A forward transform
+    // (inverse == false) is inverted here; otherwise the transf_len
+    // coefficients are copied through unchanged.
+    if(!inverse) {
+        calc_transform_inverse(tmat, tmat_ptr, perspective);
+        return;
+    }
+    for(unsigned k = 0; k < transf_len; ++k)
+        tmat[k] = tmat_ptr[k];
+}
+
+template<typename T, af_interp_type method>
+void transform(Array<T> output, const Array<T> input,
+               const Array<float> transform, const bool inverse,
+               const bool perspective)
+{
+    // For each of the ntransforms transforms, prepares the inverse-form matrix and
+    // calls the interpolation kernel selected by `method` once per output (xx, yy).
+    const af::dim4 idims    = input.dims();
+    const af::dim4 odims    = output.dims();
+    const af::dim4 istrides = input.strides();
+    const af::dim4 ostrides = output.strides();
+
+    T * out = output.get();
+    const T * in = input.get();
+    const float* tf = transform.get();
+
+    dim_t nimages     = idims[2];
+    // Multiplied in src/backend/transform.cpp
+    dim_t ntransforms = odims[2] / idims[2];
+
+    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
+                 const af::dim4 &, const af::dim4 &,
+                 const dim_t, const dim_t, const dim_t, const dim_t,
+                 const bool);
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            t_fn = &transform_n;
+            break;
+        case AF_INTERP_BILINEAR:
+            t_fn = &transform_b;
+            break;
+        case AF_INTERP_LOWER:
+            t_fn = &transform_l;
+            break;
+        default:
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
+    }
+
+    const int transf_len = (perspective) ? 9 : 6;
+    // The largest transform has 9 coefficients, so a stack buffer replaces the
+    // per-transform heap allocation (nothing to release on any exit path).
+    float tmat[9];
+
+    // For each transform channel
+    for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) {
+        // Compute inverse if required
+        const float *tmat_ptr = tf + t_idx * transf_len;
+        calc_transform_inverse(tmat, tmat_ptr, inverse, perspective, transf_len);
+        // Offset for output pointer
+        dim_t o_offset = t_idx * nimages * ostrides[2];
+        // Do transform for image
+        for(int yy = 0; yy < (int)odims[1]; yy++) {
+            for(int xx = 0; xx < (int)odims[0]; xx++) {
+                t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy, perspective);
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/transpose.hpp b/src/backend/cpu/kernel/transpose.hpp
new file mode 100644
index 0000000000..576de873ed
--- /dev/null
+++ b/src/backend/cpu/kernel/transpose.hpp
@@ -0,0 +1,122 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+T getConjugate(const T &in)
+{
+    // For non-complex types the conjugate is the value itself
+    return in;
+}
+
+// Complex specialization; inline because a full specialization defined in a header is an ordinary function for ODR purposes
+template<> inline cfloat getConjugate(const cfloat &in)
+{
+    return std::conj(in);
+}
+
+// Double-precision complex specialization; inline for the same ODR reason
+template<> inline cdouble getConjugate(const cdouble &in)
+{
+    return std::conj(in);
+}
+
+template<typename T, bool conjugate>
+void transpose(Array<T> output, const Array<T> input)
+{
+    const dim4 odims    = output.dims();
+    const dim4 ostrides = output.strides();
+    const dim4 istrides = input.strides();
+
+    T * out = output.get();
+    T const * const in = input.get();
+
+    for (dim_t l = 0; l < odims[3]; ++l) {
+        for (dim_t k = 0; k < odims[2]; ++k) {
+            // The two outer loops (l, k) handle batch mode; when the
+            // output has extent 1 along the third and fourth dimensions
+            // each of them runs only once
+            for (dim_t j = 0; j < odims[1]; ++j) {
+                for (dim_t i = 0; i < odims[0]; ++i) {
+                    // Output element (i, j) is read from input element (j, i);
+                    // the helper getIdx computes both indices from the strides
+                    const dim_t inIdx  = getIdx(istrides,j,i,k,l);
+                    const dim_t outIdx = getIdx(ostrides,i,j,k,l);
+                    if(conjugate)
+                        out[outIdx] = getConjugate(in[inIdx]);
+                    else
+                        out[outIdx] = in[inIdx];
+                }
+            }
+            // The out and in pointers do not need to be offset per
+            // batch because k and l are already passed to getIdx
+            // when the indices are computed
+        }
+    }
+}
+
+// Runtime dispatch to the compile-time conjugate / plain transpose kernels
+template<typename T> void transpose(Array<T> out, const Array<T> in, const bool conjugate)
+{
+    if (conjugate) { transpose<T, true>(out, in); } else { transpose<T, false>(out, in); }
+}
+
+// In-place transpose (optionally conjugating) of every 2D slice of input.
+// NOTE(review): swapping in place presumes square slices (idims[0] == idims[1]) - confirm at call site
+template<typename T, bool conjugate>
+void transpose_inplace(Array<T> input)
+{
+    const dim4 idims    = input.dims();
+    const dim4 istrides = input.strides();
+
+    T * in = input.get();
+
+    // The l and k loops walk the batch dimensions; each runs once for a single 2D slice
+    for (dim_t l = 0; l < idims[3]; ++l) {
+        for (dim_t k = 0; k < idims[2]; ++k) {
+            for (dim_t j = 0; j < idims[1]; ++j) {
+                // The swap loop below starts at j + 1 and never visits the
+                // diagonal, so conjugate the diagonal element separately
+                if(conjugate && j < idims[0]) {
+                    const dim_t dIdx = getIdx(istrides,j,j,k,l);
+                    in[dIdx] = getConjugate(in[dIdx]);
+                }
+                // Visit one triangle only; std::swap updates the mirrored element
+                for (dim_t i = j + 1; i < idims[0]; ++i) {
+                    const dim_t iIdx = getIdx(istrides,j,i,k,l);
+                    const dim_t oIdx = getIdx(istrides,i,j,k,l);
+                    if(conjugate) {
+                        in[iIdx] = getConjugate(in[iIdx]);
+                        in[oIdx] = getConjugate(in[oIdx]);
+                    }
+                    std::swap(in[iIdx], in[oIdx]);
+                }
+            }
+        }
+    }
+}
+
+// Runtime dispatch to the compile-time conjugate / plain in-place transpose
+template<typename T> void transpose_inplace(Array<T> in, const bool conjugate)
+{
+    if (conjugate) { transpose_inplace<T, true>(in); } else { transpose_inplace<T, false>(in); }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/triangle.hpp b/src/backend/cpu/kernel/triangle.hpp
new file mode 100644
index 0000000000..7059de5981
--- /dev/null
+++ b/src/backend/cpu/kernel/triangle.hpp
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool is_upper, bool is_unit_diag>
+void triangle(Array<T> out, const Array<T> in)
+{
+    T *o = out.get();
+    const T *i = in.get();
+    // Copies one triangle of every 2D slice of in to out and writes zero elsewhere
+    af::dim4 odm = out.dims();
+
+    af::dim4 ost = out.strides();
+    af::dim4 ist = in.strides();
+
+    for(dim_t ow = 0; ow < odm[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < odm[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < odm[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+                // Elements along dim 0 are addressed as base + ox (ost[0]/ist[0] are not applied)
+                for(dim_t ox = 0; ox < odm[0]; ox++) {
+                    const dim_t oMem = oYZW + ox;
+                    const dim_t iMem = iYZW + ox;
+                    // ox indexes dim 0 and oy dim 1: is_upper keeps oy >= ox, otherwise oy <= ox
+                    bool cond = is_upper ? (oy >= ox) : (oy <= ox);
+                    bool do_unit_diag = (is_unit_diag && ox == oy);
+                    if(cond) {
+                        o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
+                    } else {
+                        o[oMem] = scalar<T>(0);
+                    }
+
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/unwrap.hpp b/src/backend/cpu/kernel/unwrap.hpp
new file mode 100644
index 0000000000..1d996ff1f3
--- /dev/null
+++ b/src/backend/cpu/kernel/unwrap.hpp
@@ -0,0 +1,81 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, int d>
+void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
+                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+{
+    const T *inPtr = in.get();
+    T *outPtr      = out.get();
+    // Writes each wx-by-wy window of in to out, one window per index along dimension d
+    af::dim4 idims    = in.dims();
+    af::dim4 odims    = out.dims();
+    af::dim4 istrides = in.strides();
+    af::dim4 ostrides = out.strides();
+    // Number of window positions along dim 0; used below to split col into (winy, winx)
+    dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+
+            dim_t cOut = w * ostrides[3] + z * ostrides[2];
+            dim_t cIn  = w * istrides[3] + z * istrides[2];
+            const T* iptr = inPtr  + cIn;
+            T* optr_= outPtr + cOut;
+
+            for(dim_t col = 0; col < odims[d]; col++) {
+                // Offset output ptr to the slot of the current window
+                T* optr = optr_ + col * ostrides[d];
+
+                // Calculate input window index
+                dim_t winy = (col / nx);
+                dim_t winx = (col % nx);
+
+                dim_t startx = winx * sx;
+                dim_t starty = winy * sy;
+                // First window element in input coordinates, shifted by px/py; may be negative
+                dim_t spx = startx - px;
+                dim_t spy = starty - py;
+
+                // Shortcut: when true the per-element bounds check below is skipped
+                bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]);
+
+                for(dim_t y = 0; y < wy; y++) {
+                    for(dim_t x = 0; x < wx; x++) {
+                        dim_t xpad = spx + x;
+                        dim_t ypad = spy + y;
+                        // Element offset inside the window (x fastest); scaled by ostrides[1] when d == 0
+                        dim_t oloc = (y * wx + x);
+                        if (d == 0) oloc *= ostrides[1];
+                        // Copy the input value, or write zero when (xpad, ypad) is outside the input
+                        if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
+                            dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
+                            optr[oloc] = iptr[iloc];
+                        } else {
+                            optr[oloc] = scalar<T>(0.0);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/wrap.hpp b/src/backend/cpu/kernel/wrap.hpp
new file mode 100644
index 0000000000..70be3ad652
--- /dev/null
+++ b/src/backend/cpu/kernel/wrap.hpp
@@ -0,0 +1,80 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, int d>
+void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
+              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+{
+    const T *inPtr = in.get();
+    T *outPtr      = out.get();
+    // Adds each wx-by-wy window stored in in (one per index along dimension d) into out
+    af::dim4 idims    = in.dims();
+    af::dim4 odims    = out.dims();
+    af::dim4 istrides = in.strides();
+    af::dim4 ostrides = out.strides();
+    // Number of window positions along dim 0 of out; used below to split col into (winy, winx)
+    dim_t nx = (odims[0] + 2 * px - wx) / sx + 1;
+
+    for(dim_t w = 0; w < idims[3]; w++) {
+        for(dim_t z = 0; z < idims[2]; z++) {
+
+            dim_t cIn  = w * istrides[3] + z * istrides[2];
+            dim_t cOut = w * ostrides[3] + z * ostrides[2];
+            const T* iptr_ = inPtr  + cIn;
+            T* optr= outPtr + cOut;
+
+            for(dim_t col = 0; col < idims[d]; col++) {
+                // Offset input ptr to the current window
+                const T* iptr = iptr_ + col * istrides[d];
+
+                // Calculate output window index
+                dim_t winy = (col / nx);
+                dim_t winx = (col % nx);
+
+                dim_t startx = winx * sx;
+                dim_t starty = winy * sy;
+                // First window element in output coordinates, shifted by px/py; may be negative
+                dim_t spx = startx - px;
+                dim_t spy = starty - py;
+
+                // Shortcut: when true the per-element bounds check against the output dimensions is skipped
+                bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]);
+
+                for(dim_t y = 0; y < wy; y++) {
+                    for(dim_t x = 0; x < wx; x++) {
+                        dim_t xpad = spx + x;
+                        dim_t ypad = spy + y;
+                        // Element offset inside the window (x fastest); scaled by istrides[1] when d == 0
+                        dim_t iloc = (y * wx + x);
+                        if (d == 0) iloc *= istrides[1];
+                        // Values are only added to out; positions outside the output are dropped
+                        if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) {
+                            dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]);
+                            // FIXME: When using threads, atomize this
+                            optr[oloc] += iptr[iloc];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/lapack_helper.hpp b/src/backend/cpu/lapack_helper.hpp
index f978ecb92b..c5ed4fa83f 100644
--- a/src/backend/cpu/lapack_helper.hpp
+++ b/src/backend/cpu/lapack_helper.hpp
@@ -17,17 +17,17 @@
 #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
 #define LAPACK_NAME(fn) LAPACKE_##fn
 
-#ifdef __APPLE__
-#include <Accelerate/Accelerate.h>
-#include <lapacke.hpp>
-#undef AF_LAPACK_COL_MAJOR
-#define AF_LAPACK_COL_MAJOR 0
-#else
 #ifdef USE_MKL
-#include<mkl_lapacke.h>
-#else // NETLIB LAPACKE
-#include<lapacke.h>
-#endif
+    #include<mkl_lapacke.h>
+#else
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+        #include <lapacke.hpp>
+        #undef AF_LAPACK_COL_MAJOR
+        #define AF_LAPACK_COL_MAJOR 0
+    #else // NETLIB LAPACKE
+        #include<lapacke.h>
+    #endif
 #endif
 
 #endif
diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp
index 128cc02823..1e09f4dd48 100644
--- a/src/backend/cpu/lookup.cpp
+++ b/src/backend/cpu/lookup.cpp
@@ -8,33 +8,21 @@
  ********************************************************/
 
 #include <lookup.hpp>
-#include <err_cpu.hpp>
 #include <cstdlib>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/lookup.hpp>
 
 namespace cpu
 {
 
-static inline
-dim_t trimIndex(int idx, const dim_t &len)
-{
-    int ret_val = idx;
-    int offset  = abs(ret_val)%len;
-    if (ret_val<0) {
-        ret_val = offset-1;
-    } else if (ret_val>=len) {
-        ret_val = len-offset-1;
-    }
-    return ret_val;
-}
-
 template<typename in_t, typename idx_t>
 Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const unsigned dim)
 {
-    const dim4 iDims = input.dims();
-    const dim4 iStrides = input.strides();
+    input.eval();
+    indices.eval();
 
-    const in_t *inPtr = input.get();
-    const idx_t *idxPtr = indices.get();
+    const dim4 iDims = input.dims();
 
     dim4 oDims(1);
     for (int d=0; d<4; ++d)
@@ -42,35 +30,7 @@ Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const
 
     Array<in_t> out = createEmptyArray<in_t>(oDims);
 
-    dim4 oStrides = out.strides();
-
-    in_t *outPtr = out.get();
-
-    for (dim_t l=0; l<oDims[3]; ++l) {
-
-        dim_t iLOff = iStrides[3]*(dim==3 ? trimIndex((dim_t)idxPtr[l], iDims[3]): l);
-        dim_t oLOff = l*oStrides[3];
-
-        for (dim_t k=0; k<oDims[2]; ++k) {
-
-            dim_t iKOff = iStrides[2]*(dim==2 ? trimIndex((dim_t)idxPtr[k], iDims[2]): k);
-            dim_t oKOff = k*oStrides[2];
-
-            for (dim_t j=0; j<oDims[1]; ++j) {
-
-                dim_t iJOff = iStrides[1]*(dim==1 ? trimIndex((dim_t)idxPtr[j], iDims[1]): j);
-                dim_t oJOff = j*oStrides[1];
-
-                for (dim_t i=0; i<oDims[0]; ++i) {
-
-                    dim_t iIOff = iStrides[0]*(dim==0 ? trimIndex((dim_t)idxPtr[i], iDims[0]): i);
-                    dim_t oIOff = i*oStrides[0];
-
-                    outPtr[oLOff+oKOff+oJOff+oIOff] = inPtr[iLOff+iKOff+iJOff+iIOff];
-                }
-            }
-        }
-    }
+    getQueue().enqueue(kernel::lookup<in_t, idx_t>, out, input, indices, dim);
 
     return out;
 }
diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp
index 0eefb16816..24ca4acd78 100644
--- a/src/backend/cpu/lu.cpp
+++ b/src/backend/cpu/lu.cpp
@@ -11,23 +11,21 @@
 #include <err_common.hpp>
 
 #if defined(WITH_CPU_LINEAR_ALGEBRA)
-
 #include <af/dim4.hpp>
 #include <handle.hpp>
 #include <iostream>
 #include <cassert>
-#include <err_cpu.hpp>
-
 #include <range.hpp>
 #include <lapack_helper.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/lu.hpp>
 
 namespace cpu
 {
 
 template<typename T>
-using getrf_func_def = int (*)(ORDER_TYPE, int, int,
-                               T*, int,
-                               int*);
+using getrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, int*);
 
 #define LU_FUNC_DEF( FUNC )                                     \
 template<typename T> FUNC##_func_def<T> FUNC##_func();
@@ -43,78 +41,14 @@ LU_FUNC(getrf , double , d)
 LU_FUNC(getrf , cfloat , c)
 LU_FUNC(getrf , cdouble, z)
 
-template<typename T>
-void lu_split(Array<T> &lower, Array<T> &upper, const Array<T> &in)
-{
-    T *l = lower.get();
-    T *u = upper.get();
-    const T *i = in.get();
-
-    dim4 ldm = lower.dims();
-    dim4 udm = upper.dims();
-    dim4 idm = in.dims();
-
-    dim4 lst = lower.strides();
-    dim4 ust = upper.strides();
-    dim4 ist = in.strides();
-
-    for(dim_t ow = 0; ow < idm[3]; ow++) {
-        const dim_t lW = ow * lst[3];
-        const dim_t uW = ow * ust[3];
-        const dim_t iW = ow * ist[3];
-
-        for(dim_t oz = 0; oz < idm[2]; oz++) {
-            const dim_t lZW = lW + oz * lst[2];
-            const dim_t uZW = uW + oz * ust[2];
-            const dim_t iZW = iW + oz * ist[2];
-
-            for(dim_t oy = 0; oy < idm[1]; oy++) {
-                const dim_t lYZW = lZW + oy * lst[1];
-                const dim_t uYZW = uZW + oy * ust[1];
-                const dim_t iYZW = iZW + oy * ist[1];
-
-                for(dim_t ox = 0; ox < idm[0]; ox++) {
-                    const dim_t lMem = lYZW + ox;
-                    const dim_t uMem = uYZW + ox;
-                    const dim_t iMem = iYZW + ox;
-                    if(ox > oy) {
-                        if(oy < ldm[1])
-                            l[lMem] = i[iMem];
-                        if(ox < udm[0])
-                            u[uMem] = scalar<T>(0);
-                    } else if (oy > ox) {
-                        if(oy < ldm[1])
-                            l[lMem] = scalar<T>(0);
-                        if(ox < udm[0])
-                            u[uMem] = i[iMem];
-                    } else if(ox == oy) {
-                        if(oy < ldm[1])
-                            l[lMem] = scalar<T>(1.0);
-                        if(ox < udm[0])
-                            u[uMem] = i[iMem];
-                    }
-                }
-            }
-        }
-    }
-}
-
-void convertPivot(Array<int> &pivot, int out_sz)
-{
-    Array<int> p = range<int>(dim4(out_sz), 0);
-    int *d_pi = pivot.get();
-    int *d_po = p.get();
-    dim_t d0 = pivot.dims()[0];
-    for(int j = 0; j < (int)d0; j++) {
-        // 1 indexed in pivot
-        std::swap(d_po[j], d_po[d_pi[j] - 1]);
-    }
-    pivot = p;
-}
-
 template<typename T>
 void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
 {
+    lower.eval();
+    upper.eval();
+    pivot.eval();
+    in.eval();
+
     dim4 iDims = in.dims();
     int M = iDims[0];
     int N = iDims[1];
@@ -128,35 +62,36 @@ void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
     lower = createEmptyArray<T>(ldims);
     upper = createEmptyArray<T>(udims);
 
-    lu_split<T>(lower, upper, in_copy);
+    getQueue().enqueue(kernel::lu_split<T>, lower, upper, in_copy);
 }
 
 template<typename T>
 Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
 {
-    dim4 iDims = in.dims();
-    int M = iDims[0];
-    int N = iDims[1];
-
-    Array<int> pivot = createEmptyArray<int>(af::dim4(min(M, N), 1, 1, 1));
+    in.eval();
 
-    getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
-                    in.get(), in.strides()[1],
-                    pivot.get());
-
-    if(convert_pivot) convertPivot(pivot, M);
-
-    return pivot;
+    dim4 iDims = in.dims();
+    Array<int> pivot = createEmptyArray<int>(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1));
+
+    auto func = [=] (Array<T> in, Array<int> pivot) {
+        dim4 iDims = in.dims();
+        getrf_func<T>()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get());
+    };
+    getQueue().enqueue(func, in, pivot);
+
+    if(convert_pivot) {
+        Array<int> p = range<int>(dim4(iDims[0]), 0);
+        getQueue().enqueue(kernel::convertPivot, p, pivot);
+        return p;
+    } else {
+        return pivot;
+    }
 }
 
-#define INSTANTIATE_LU(T)                                                                           \
-    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
-    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
-
-INSTANTIATE_LU(float)
-INSTANTIATE_LU(cfloat)
-INSTANTIATE_LU(double)
-INSTANTIATE_LU(cdouble)
+bool isLAPACKAvailable()
+{
+    return true;
+}
 
 }
 
@@ -177,6 +112,18 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
     AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
 }
 
+bool isLAPACKAvailable()
+{
+    return false;
+}
+
+}
+
+#endif
+
+namespace cpu
+{
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -187,5 +134,3 @@ INSTANTIATE_LU(double)
 INSTANTIATE_LU(cdouble)
 
 }
-
-#endif
diff --git a/src/backend/cpu/lu.hpp b/src/backend/cpu/lu.hpp
index c25dcaaa16..3fef461067 100644
--- a/src/backend/cpu/lu.hpp
+++ b/src/backend/cpu/lu.hpp
@@ -17,4 +17,6 @@ namespace cpu
 
     template<typename T>
     Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+
+    bool isLAPACKAvailable();
 }
diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp
index 4d930145d5..58091a1f49 100644
--- a/src/backend/cpu/match_template.cpp
+++ b/src/backend/cpu/match_template.cpp
@@ -12,132 +12,24 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <match_template.hpp>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/match_template.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename inType, typename outType, af_match_type mType>
-Array<outType> match_template(const Array<inType> &sImg, const Array<inType> &tImg)
+template<typename InT, typename OutT, af_match_type MatchT>
+Array<OutT> match_template(const Array<InT> &sImg, const Array<InT> &tImg)
 {
-    const dim4 sDims = sImg.dims();
-    const dim4 tDims = tImg.dims();
-    const dim4 sStrides = sImg.strides();
-    const dim4 tStrides = tImg.strides();
+    sImg.eval();
+    tImg.eval();
 
-    const dim_t tDim0  = tDims[0];
-    const dim_t tDim1  = tDims[1];
-    const dim_t sDim0  = sDims[0];
-    const dim_t sDim1  = sDims[1];
+    Array<OutT> out = createEmptyArray<OutT>(sImg.dims());
 
-    Array<outType> out = createEmptyArray<outType>(sDims);
-    const dim4 oStrides = out.strides();
-
-    outType tImgMean = outType(0);
-    dim_t winNumElements = tImg.elements();
-    bool needMean = mType==AF_ZSAD || mType==AF_LSAD ||
-                    mType==AF_ZSSD || mType==AF_LSSD ||
-                    mType==AF_ZNCC;
-    const inType * tpl = tImg.get();
-
-    if (needMean) {
-        for(dim_t tj=0; tj<tDim1; tj++) {
-            dim_t tjStride = tj*tStrides[1];
-
-            for(dim_t ti=0; ti<tDim0; ti++) {
-                tImgMean += (outType)tpl[tjStride+ti*tStrides[0]];
-            }
-        }
-        tImgMean /= winNumElements;
-    }
-
-    outType * dst      = out.get();
-    const inType * src = sImg.get();
-
-    for(dim_t b3=0; b3<sDims[3]; ++b3) {
-    for(dim_t b2=0; b2<sDims[2]; ++b2) {
-
-        // slide through image window after window
-        for(dim_t sj=0; sj<sDim1; sj++) {
-
-            dim_t ojStride = sj*oStrides[1];
-
-            for(dim_t si=0; si<sDim0; si++) {
-                outType disparity = outType(0);
-
-                // mean for window
-                // this variable will be used based on mType value
-                outType wImgMean = outType(0);
-                if (needMean) {
-                    for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
-                        dim_t jStride = j*sStrides[1];
-
-                        for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
-                            inType sVal = ((j<sDim1 && i<sDim0) ?
-                                    src[jStride + i*sStrides[0]] : inType(0));
-                            wImgMean += (outType)sVal;
-                        }
-                    }
-                    wImgMean /= winNumElements;
-                }
-
-                // run the window match metric
-                for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
-                    dim_t jStride = j*sStrides[1];
-                    dim_t tjStride = tj*tStrides[1];
-
-                    for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
-                        inType sVal = ((j<sDim1 && i<sDim0) ?
-                                            src[jStride + i*sStrides[0]] : inType(0));
-                        inType tVal = tpl[tjStride+ti*tStrides[0]];
-                        outType temp;
-                        switch(mType) {
-                            case AF_SAD:
-                                disparity += fabs((outType)sVal-(outType)tVal);
-                                break;
-                            case AF_ZSAD:
-                                disparity += fabs((outType)sVal - wImgMean -
-                                                  (outType)tVal + tImgMean);
-                                break;
-                            case AF_LSAD:
-                                disparity += fabs((outType)sVal-(wImgMean/tImgMean)*tVal);
-                                break;
-                            case AF_SSD:
-                                disparity += ((outType)sVal-(outType)tVal)*((outType)sVal-(outType)tVal);
-                                break;
-                            case AF_ZSSD:
-                                temp = ((outType)sVal - wImgMean - (outType)tVal + tImgMean);
-                                disparity += temp*temp;
-                                break;
-                            case AF_LSSD:
-                                temp = ((outType)sVal-(wImgMean/tImgMean)*tVal);
-                                disparity += temp*temp;
-                                break;
-                            case AF_NCC:
-                                //TODO: furture implementation
-                                break;
-                            case AF_ZNCC:
-                                //TODO: furture implementation
-                                break;
-                            case AF_SHD:
-                                //TODO: furture implementation
-                                break;
-                        }
-                    }
-                }
-                // output is just created, hence not doing the
-                // extra multiplication for 0th dim stride
-                dst[ojStride + si] = disparity;
-            }
-        }
-        src += sStrides[2];
-        dst += oStrides[2];
-    }
-        src += sStrides[3];
-        dst += oStrides[3];
-    }
+    getQueue().enqueue(kernel::matchTemplate<OutT, InT, MatchT>, out, sImg, tImg);
 
     return out;
 }
diff --git a/src/backend/cpu/math.cpp b/src/backend/cpu/math.cpp
index 5a6bcbc67e..e00fd78fcd 100644
--- a/src/backend/cpu/math.cpp
+++ b/src/backend/cpu/math.cpp
@@ -11,39 +11,41 @@
 
 namespace cpu
 {
-    uint abs(uint val) { return val; }
-    uchar abs(uchar val) { return val; }
-    uintl abs(uintl val) { return val; }
-
-    cfloat  scalar(float val)
-    {
-        cfloat  cval = {(float)val, 0};
-        return cval;
-    }
-
-    cdouble scalar(double val)
-    {
-        cdouble  cval = {val, 0};
-        return cval;
-    }
-
-    cfloat min(cfloat lhs, cfloat rhs)
-    {
-        return abs(lhs) < abs(rhs) ? lhs : rhs;
-    }
-
-    cdouble min(cdouble lhs, cdouble rhs)
-    {
-        return abs(lhs) < abs(rhs) ? lhs : rhs;
-    }
-
-    cfloat max(cfloat lhs, cfloat rhs)
-    {
-        return abs(lhs) > abs(rhs) ? lhs : rhs;
-    }
-
-    cdouble max(cdouble lhs, cdouble rhs)
-    {
-        return abs(lhs) > abs(rhs) ? lhs : rhs;
-    }
+
+uint abs(uint val) { return val; }
+uchar abs(uchar val) { return val; }
+uintl abs(uintl val) { return val; }
+
+cfloat  scalar(float val)
+{
+    cfloat  cval = {(float)val, 0};
+    return cval;
+}
+
+cdouble scalar(double val)
+{
+    cdouble  cval = {val, 0};
+    return cval;
+}
+
+cfloat min(cfloat lhs, cfloat rhs)
+{
+    return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cdouble min(cdouble lhs, cdouble rhs)
+{
+    return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cfloat max(cfloat lhs, cfloat rhs)
+{
+    return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
+cdouble max(cdouble lhs, cdouble rhs)
+{
+    return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
 }
diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp
index b52eaf9387..b5bbf758a1 100644
--- a/src/backend/cpu/meanshift.cpp
+++ b/src/backend/cpu/meanshift.cpp
@@ -16,6 +16,9 @@
 #include <algorithm>
 #include <err_cpu.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/meanshift.hpp>
 
 using af::dim4;
 using std::vector;
@@ -23,125 +26,15 @@ using std::vector;
 namespace cpu
 {
 
-inline dim_t clamp(dim_t a, dim_t mn, dim_t mx)
-{
-    return (a<mn ? mn : (a>mx ? mx : a));
-}
-
 template<typename T, bool is_color>
 Array<T>  meanshift(const Array<T> &in, const float &s_sigma, const float &c_sigma, const unsigned iter)
 {
-    const dim4 dims     = in.dims();
-    const dim4 istrides = in.strides();
-    Array<T> out        = createEmptyArray<T>(dims);
-    const dim4 ostrides = out.strides();
-
-    const dim_t bCount   = (is_color ? 1 : dims[2]);
-    const dim_t channels = (is_color ? dims[2] : 1);
-
-    // clamp spatical and chromatic sigma's
-    float space_          = std::min(11.5f, s_sigma);
-    const dim_t radius = std::max((int)(space_ * 1.5f), 1);
-    const float cvar      = c_sigma*c_sigma;
-
-    vector<float> means;
-    vector<float> centers;
-    vector<float> tmpclrs;
-    means.reserve(channels);
-    centers.reserve(channels);
-    tmpclrs.reserve(channels);
-
-    T *outData       = out.get();
-    const T * inData = in.get();
-
-    for(dim_t b3=0; b3<dims[3]; ++b3) {
-        for(dim_t b2=0; b2<bCount; ++b2) {
-
-            for(dim_t j=0; j<dims[1]; ++j) {
-
-                dim_t j_in_off  = j*istrides[1];
-                dim_t j_out_off = j*ostrides[1];
-
-                for(dim_t i=0; i<dims[0]; ++i) {
-
-                    dim_t i_in_off  = i*istrides[0];
-                    dim_t i_out_off = i*ostrides[0];
-
-                    // clear means and centers for this pixel
-                    for(dim_t ch=0; ch<channels; ++ch) {
-                        means[ch] = 0.0f;
-                        // the expression ch*istrides[2] will only effect when ch>1
-                        // i.e for color images where batch is along fourth dimension
-                        centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]];
-                    }
-
-                    // scope of meanshift iterationd begin
-                    for(unsigned it=0; it<iter; ++it) {
-
-                        int count   = 0;
-                        int shift_x = 0;
-                        int shift_y = 0;
-
-                        for(dim_t wj=-radius; wj<=radius; ++wj) {
-
-                            int hit_count = 0;
-
-                            for(dim_t wi=-radius; wi<=radius; ++wi) {
-
-                                dim_t tj = j + wj;
-                                dim_t ti = i + wi;
-
-                                // clamps offsets
-                                tj = clamp(tj, 0ll, dims[1]-1);
-                                ti = clamp(ti, 0ll, dims[0]-1);
-
-                                // proceed
-                                float norm = 0.0f;
-                                for(dim_t ch=0; ch<channels; ++ch) {
-                                    tmpclrs[ch] = inData[ tj*istrides[1] + ti*istrides[0] + ch*istrides[2]];
-                                    norm += (centers[ch]-tmpclrs[ch]) * (centers[ch]-tmpclrs[ch]);
-                                }
-
-                                if (norm<= cvar) {
-                                    for(dim_t ch=0; ch<channels; ++ch)
-                                        means[ch] += tmpclrs[ch];
-                                    shift_x += wi;
-                                    ++hit_count;
-                                }
-
-                            }
-                            count+= hit_count;
-                            shift_y += wj*hit_count;
-                        }
-
-                        if (count==0) { break; }
-
-                        const float fcount = 1.f/count;
-                        const int mean_x = (int)(shift_x*fcount+0.5f);
-                        const int mean_y = (int)(shift_y*fcount+0.5f);
-                        for(dim_t ch=0; ch<channels; ++ch)
-                            means[ch] *= fcount;
+    in.eval();
 
-                        float norm = 0.f;
-                        for(dim_t ch=0; ch<channels; ++ch)
-                            norm += ((means[ch]-centers[ch])*(means[ch]-centers[ch]));
-                        bool stop = ((abs(shift_y-mean_y)+abs(shift_x-mean_x)) + norm) <= 1;
-                        shift_x = mean_x;
-                        shift_y = mean_y;
-                        for(dim_t ch=0; ch<channels; ++ch)
-                            centers[ch] = means[ch];
-                        if (stop) { break; }
-                    } // scope of meanshift iterations end
+    Array<T> out = createEmptyArray<T>(in.dims());
 
-                    for(dim_t ch=0; ch<channels; ++ch)
-                        outData[j_out_off + i_out_off + ch*ostrides[2]] = centers[ch];
+    getQueue().enqueue(kernel::meanShift<T, is_color>, out, in, s_sigma, c_sigma, iter);
 
-                }
-            }
-            outData += ostrides[2];
-            inData  += istrides[2];
-        }
-    }
     return out;
 }
 
diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp
index 3ded3c045a..8ae4e33921 100644
--- a/src/backend/cpu/medfilt.cpp
+++ b/src/backend/cpu/medfilt.cpp
@@ -12,8 +12,9 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <medfilt.hpp>
-#include <err_cpu.hpp>
-#include <algorithm>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/medfilt.hpp>
 
 using af::dim4;
 
@@ -23,114 +24,11 @@ namespace cpu
 template<typename T, af_border_type pad>
 Array<T> medfilt(const Array<T> &in, dim_t w_len, dim_t w_wid)
 {
-    const dim4 dims     = in.dims();
-    const dim4 istrides = in.strides();
-    Array<T> out        = createEmptyArray<T>(dims);
-    const dim4 ostrides = out.strides();
+    in.eval();
 
-    std::vector<T> wind_vals;
-    wind_vals.reserve(w_len*w_wid);
+    Array<T> out = createEmptyArray<T>(in.dims());
 
-    T const * in_ptr = in.get();
-    T * out_ptr = out.get();
-
-    for(int b3=0; b3<(int)dims[3]; b3++) {
-
-        for(int b2=0; b2<(int)dims[2]; b2++) {
-
-            for(int col=0; col<(int)dims[1]; col++) {
-
-                int ocol_off = col*ostrides[1];
-
-                for(int row=0; row<(int)dims[0]; row++) {
-
-                    wind_vals.clear();
-
-                    for(int wj=0; wj<(int)w_wid; ++wj) {
-
-                        bool isColOff = false;
-
-                        int im_col = col + wj-w_wid/2;
-                        int im_coff;
-                        switch(pad) {
-                            case AF_PAD_ZERO:
-                                im_coff = im_col * istrides[1];
-                                if (im_col < 0 || im_col>=(int)dims[1])
-                                    isColOff = true;
-                                break;
-                            case AF_PAD_SYM:
-                                {
-                                    if (im_col < 0) {
-                                        im_col *= -1;
-                                        isColOff = true;
-                                    }
-
-                                    if (im_col>=(int)dims[1]) {
-                                        im_col = 2*((int)dims[1]-1) - im_col;
-                                        isColOff = true;
-                                    }
-
-                                    im_coff = im_col * istrides[1];
-                                }
-                                break;
-                        }
-
-                        for(int wi=0; wi<(int)w_len; ++wi) {
-
-                            bool isRowOff = false;
-
-                            int im_row = row + wi-w_len/2;
-                            int im_roff;
-                            switch(pad) {
-                                case AF_PAD_ZERO:
-                                    im_roff = im_row * istrides[0];
-                                    if (im_row < 0 || im_row>=(int)dims[0])
-                                        isRowOff = true;
-                                    break;
-                                case AF_PAD_SYM:
-                                    {
-                                        if (im_row < 0) {
-                                            im_row *= -1;
-                                            isRowOff = true;
-                                        }
-
-                                        if (im_row>=(int)dims[0]) {
-                                            im_row = 2*((int)dims[0]-1) - im_row;
-                                            isRowOff = true;
-                                        }
-
-                                        im_roff = im_row * istrides[0];
-                                    }
-                                    break;
-                            }
-
-                            if(isRowOff || isColOff) {
-                                switch(pad) {
-                                    case AF_PAD_ZERO:
-                                        wind_vals.push_back(0);
-                                        break;
-                                    case AF_PAD_SYM:
-                                        wind_vals.push_back(in_ptr[im_coff+im_roff]);
-                                        break;
-                                }
-                            } else
-                                wind_vals.push_back(in_ptr[im_coff+im_roff]);
-                        }
-                    }
-
-                    std::stable_sort(wind_vals.begin(),wind_vals.end());
-                    int off = wind_vals.size()/2;
-                    if (wind_vals.size()%2==0)
-                        out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2;
-                    else {
-                        out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off];
-                    }
-                }
-            }
-            in_ptr  += istrides[2];
-            out_ptr += ostrides[2];
-        }
-    }
+    getQueue().enqueue(kernel::medfilt<T, pad>, out, in, w_len, w_wid);
 
     return out;
 }
diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp
index ac10643c9b..b4b1b450d9 100644
--- a/src/backend/cpu/memory.cpp
+++ b/src/backend/cpu/memory.cpp
@@ -10,237 +10,195 @@
 #include <memory.hpp>
 #include <err_cpu.hpp>
 #include <types.hpp>
-#include <map>
-#include <dispatch.hpp>
-#include <cstdlib>
-#include <mutex>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <memory>
+#include <MemoryManager.hpp>
 
-namespace cpu
-{
-
-    static size_t memory_resolution = 1024; //1KB
+#ifndef AF_MEM_DEBUG
+#define AF_MEM_DEBUG 0
+#endif
 
-    void setMemStepSize(size_t step_bytes)
-    {
-        memory_resolution = step_bytes;
-    }
+#ifndef AF_CPU_MEM_DEBUG
+#define AF_CPU_MEM_DEBUG 0
+#endif
 
-    size_t getMemStepSize(void)
-    {
-        return memory_resolution;
-    }
+namespace cpu
+{
 
-    class Manager
+class MemoryManager  : public common::MemoryManager
+{
+    int getActiveDeviceId();
+    size_t getMaxMemorySize(int id);
+public:
+    MemoryManager();
+    void *nativeAlloc(const size_t bytes);
+    void nativeFree(void *ptr);
+    ~MemoryManager()
     {
-        public:
-        static bool initialized;
-        Manager()
-        {
-            initialized = true;
-        }
-
-        ~Manager()
-        {
-            garbageCollect();
+        common::lock_guard_t lock(this->memory_mutex);
+        for (int n = 0; n < getDeviceCount(); n++) {
+            cpu::setDevice(n);
+            this->garbageCollect();
         }
-    };
-
-    bool Manager::initialized = false;
-
-    static void managerInit()
-    {
-        if(Manager::initialized == false)
-            static Manager pm = Manager();
     }
+};
 
-    typedef struct
-    {
-        bool is_free;
-        bool is_unlinked;
-        size_t bytes;
-    } mem_info;
-
-    static size_t used_bytes = 0;
-    static size_t used_buffers = 0;
-    static size_t total_bytes = 0;
-    typedef std::map<void *, mem_info> mem_t;
-    typedef mem_t::iterator mem_iter;
-
-    mem_t memory_map;
-    std::mutex memory_map_mutex;
-
-    template<typename T>
-    void freeWrapper(T *ptr)
-    {
-        free((void *)ptr);
-    }
-
-    void garbageCollect()
-    {
-        for(mem_iter iter = memory_map.begin();
-            iter != memory_map.end(); ++iter) {
-
-            if ((iter->second).is_free) {
-
-                if (!(iter->second).is_unlinked) {
-                    freeWrapper(iter->first);
-                    total_bytes -= iter->second.bytes;
-                }
-            }
-        }
-
-        mem_iter memory_curr = memory_map.begin();
-        mem_iter memory_end  = memory_map.end();
+int MemoryManager::getActiveDeviceId()
+{
+    return cpu::getActiveDeviceId();
+}
 
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) {
-                memory_map.erase(memory_curr++);
-            } else {
-                ++memory_curr;
-            }
-        }
-    }
+size_t MemoryManager::getMaxMemorySize(int id)
+{
+    return cpu::getDeviceMemorySize(id);
+}
 
-    template<typename T>
-    T* memAlloc(const size_t &elements)
-    {
-        managerInit();
+MemoryManager::MemoryManager() :
+    common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG)
+{
+    this->setMaxMemorySize();
+}
 
-        T* ptr = NULL;
-        size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
 
-        if (elements > 0) {
-            std::lock_guard<std::mutex> lock(memory_map_mutex);
+void *MemoryManager::nativeAlloc(const size_t bytes)
+{
+    void *ptr = malloc(bytes);
+    if (!ptr) AF_ERROR("Unable to allocate memory", AF_ERR_NO_MEM);
+    return ptr;
+}
 
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (memory_map.size() > MAX_BUFFERS ||
-                used_bytes >= MAX_BYTES) {
+void MemoryManager::nativeFree(void *ptr)
+{
+    return free((void *)ptr);
+}
 
-                garbageCollect();
-            }
+static MemoryManager &getMemoryManager()
+{
+    static MemoryManager instance;
+    return instance;
+}
 
-            for(mem_iter iter = memory_map.begin();
-                iter != memory_map.end(); ++iter) {
+void setMemStepSize(size_t step_bytes)
+{
+    getMemoryManager().setMemStepSize(step_bytes);
+}
 
-                mem_info info = iter->second;
+size_t getMemStepSize(void)
+{
+    return getMemoryManager().getMemStepSize();
+}
 
-                if ( info.is_free &&
-                    !info.is_unlinked &&
-                     info.bytes == alloc_bytes) {
+size_t getMaxBytes()
+{
+    return getMemoryManager().getMaxBytes();
+}
 
-                    iter->second.is_free = false;
-                    used_bytes += alloc_bytes;
-                    used_buffers++;
-                    return (T *)iter->first;
-                }
-            }
+unsigned getMaxBuffers()
+{
+    return getMemoryManager().getMaxBuffers();
+}
 
-            // Perform garbage collection if memory can not be allocated
-            ptr = (T *)malloc(alloc_bytes);
+void garbageCollect()
+{
+    getMemoryManager().garbageCollect();
+}
 
-            if (ptr == NULL) {
-                AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM);
-            }
+void printMemInfo(const char *msg, const int device)
+{
+    getMemoryManager().printInfo(msg, device);
+}
 
-            mem_info info = {false, false, alloc_bytes};
-            memory_map[ptr] = info;
+template<typename T>
+T* memAlloc(const size_t &elements)
+{
+    T *ptr = nullptr;
 
-            used_bytes += alloc_bytes;
-            used_buffers++;
-            total_bytes += alloc_bytes;
-        }
-        return ptr;
+    try {
+        ptr = (T *)getMemoryManager().alloc(elements * sizeof(T), false);
+    } catch(...) {
+        getQueue().sync();
+        ptr = (T *)getMemoryManager().alloc(elements * sizeof(T), false);
     }
+    return ptr;
+}
 
-    template<typename T>
-    void memFree(T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
-
-        mem_iter iter = memory_map.find((void *)ptr);
-
-        if (iter != memory_map.end()) {
-
-            iter->second.is_free = true;
-            if ((iter->second).is_unlinked) return;
-
-            used_bytes -= iter->second.bytes;
-            used_buffers--;
+void* memAllocUser(const size_t &bytes)
+{
+    void *ptr = nullptr;
 
-        } else {
-            freeWrapper(ptr); // Free it because we are not sure what the size is
-        }
+    try {
+        ptr = getMemoryManager().alloc(bytes, true);
+    } catch(...) {
+        getQueue().sync();
+        ptr = getMemoryManager().alloc(bytes, true);
     }
+    return ptr;
+}
 
-    template<typename T>
-    void memPop(const T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
+template<typename T>
+void memFree(T *ptr)
+{
+    return getMemoryManager().unlock((void *)ptr, false);
+}
 
-        mem_iter iter = memory_map.find((void *)ptr);
+void memFreeUser(void *ptr)
+{
+    getMemoryManager().unlock((void *)ptr, true);
+}
 
-        if (iter != memory_map.end()) {
-            iter->second.is_unlinked = true;
-        } else {
-            mem_info info = { false,
-                              true,
-                              100 }; //This number is not relevant
+void memLock(const void *ptr)
+{
+    getMemoryManager().userLock((void *)ptr);
+}
 
-            memory_map[(void *)ptr] = info;
-        }
-    }
+void memUnlock(const void *ptr)
+{
+    getMemoryManager().userUnlock((void *)ptr);
+}
 
-    template<typename T>
-    void memPush(const T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
-        mem_iter iter = memory_map.find((void *)ptr);
-        if (iter != memory_map.end()) {
-            iter->second.is_unlinked = false;
-        }
-    }
+void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                      size_t *lock_bytes,  size_t *lock_buffers)
+{
+    getQueue().sync();
+    getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers,
+                                  lock_bytes,  lock_buffers);
+}
 
+template<typename T>
+T* pinnedAlloc(const size_t &elements)
+{
+    return (T *)getMemoryManager().alloc(elements * sizeof(T), false);
+}
 
-    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-    {
-        if (alloc_bytes   ) *alloc_bytes   = total_bytes;
-        if (alloc_buffers ) *alloc_buffers = memory_map.size();
-        if (lock_bytes    ) *lock_bytes    = used_bytes;
-        if (lock_buffers  ) *lock_buffers  = used_buffers;
-    }
+template<typename T>
+void pinnedFree(T* ptr)
+{
+    return getMemoryManager().unlock((void *)ptr, false);
+}
 
-    template<typename T>
-    T* pinnedAlloc(const size_t &elements)
-    {
-        return memAlloc<T>(elements);
-    }
+bool checkMemoryLimit()
+{
+    return getMemoryManager().checkMemoryLimit();
+}
 
-    template<typename T>
-    void pinnedFree(T* ptr)
-    {
-        memFree<T>(ptr);
-    }
+#define INSTANTIATE(T)                                      \
+    template T* memAlloc(const size_t &elements);           \
+    template void memFree(T* ptr);                          \
+    template T* pinnedAlloc(const size_t &elements);        \
+    template void pinnedFree(T* ptr);                       \
+
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(ushort)
+INSTANTIATE(short )
 
-#define INSTANTIATE(T)                                  \
-    template T* memAlloc(const size_t &elements);       \
-    template void memFree(T* ptr);                      \
-    template void memPop(const T* ptr);                 \
-    template void memPush(const T* ptr);                \
-    template T* pinnedAlloc(const size_t &elements);    \
-    template void pinnedFree(T* ptr);                   \
-
-    INSTANTIATE(float)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(double)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short )
 }
diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp
index 0b1c960ed4..91116fbcfc 100644
--- a/src/backend/cpu/memory.hpp
+++ b/src/backend/cpu/memory.hpp
@@ -9,24 +9,35 @@
 #pragma once
 
 #include <af/defines.h>
+
 namespace cpu
 {
     template<typename T> T* memAlloc(const size_t &elements);
+    void *memAllocUser(const size_t &bytes);
+
+    // Need these as 2 separate function and not a default argument
+    // This is because it is used as the deleter in shared pointer
+    // which cannot support default arguments
     template<typename T> void memFree(T* ptr);
-    template<typename T> void memPop(const T *ptr);
-    template<typename T> void memPush(const T *ptr);
+    void memFreeUser(void* ptr);
+
+    void memLock(const void *ptr);
+    void memUnlock(const void *ptr);
 
     template<typename T> T* pinnedAlloc(const size_t &elements);
     template<typename T> void pinnedFree(T* ptr);
 
-    static const unsigned MAX_BUFFERS = 100;
-    static const unsigned MAX_BYTES = 100 * (1 << 20);
+    size_t getMaxBytes();
+    unsigned getMaxBuffers();
 
     void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
                           size_t *lock_bytes,  size_t *lock_buffers);
     void garbageCollect();
     void pinnedGarbageCollect();
 
+    void printMemInfo(const char *msg, const int device);
+
     void setMemStepSize(size_t step_bytes);
     size_t getMemStepSize(void);
+    bool checkMemoryLimit();
 }
diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp
index eb2e1de339..1ae4680b9d 100644
--- a/src/backend/cpu/morph.cpp
+++ b/src/backend/cpu/morph.cpp
@@ -13,78 +13,24 @@
 #include <Array.hpp>
 #include <morph.hpp>
 #include <algorithm>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/morph.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-static inline unsigned getIdx(const dim4 &strides,
-        int i, int j = 0, int k = 0, int l = 0)
-{
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i * strides[0]);
-}
-
 template<typename T, bool isDilation>
 Array<T> morph(const Array<T> &in, const Array<T> &mask)
 {
-    const dim4 dims       = in.dims();
-    const dim4 window     = mask.dims();
-    const dim_t R0     = window[0]/2;
-    const dim_t R1     = window[1]/2;
-    const dim4 istrides   = in.strides();
-    const dim4 fstrides   = mask.strides();
-
-    Array<T> out         = createEmptyArray<T>(dims);
-    const dim4 ostrides   = out.strides();
-
-    T* outData            = out.get();
-    const T*   inData     = in.get();
-    const T*   filter     = mask.get();
-
-    for(dim_t b3=0; b3<dims[3]; ++b3) {
-        for(dim_t b2=0; b2<dims[2]; ++b2) {
-            // either channels or batch is handled by outer most loop
-            for(dim_t j=0; j<dims[1]; ++j) {
-                // j steps along 2nd dimension
-                for(dim_t i=0; i<dims[0]; ++i) {
-                    // i steps along 1st dimension
-                    T filterResult = inData[ getIdx(istrides, i, j) ];
-
-                    // wj,wi steps along 2nd & 1st dimensions of filter window respectively
-                    for(dim_t wj=0; wj<window[1]; wj++) {
-                        for(dim_t wi=0; wi<window[0]; wi++) {
-
-                            dim_t offj = j+wj-R1;
-                            dim_t offi = i+wi-R0;
-
-                            T maskValue = filter[ getIdx(fstrides, wi, wj) ];
-
-                            if ((maskValue > (T)0) && offi>=0 && offj>=0 && offi<dims[0] && offj<dims[1]) {
-
-                                T inValue   = inData[ getIdx(istrides, offi, offj) ];
+    in.eval();
+    mask.eval();
 
-                                if (isDilation)
-                                    filterResult = std::max(filterResult, inValue);
-                                else
-                                    filterResult = std::min(filterResult, inValue);
-                            }
+    Array<T> out = createEmptyArray<T>(in.dims());
 
-                        } // window 1st dimension loop ends here
-                    } // filter window loop ends here
-
-                    outData[ getIdx(ostrides, i, j) ] = filterResult;
-                } //1st dimension loop ends here
-            } // 2nd dimension loop ends here
-
-            // next iteration will be next batch if any
-            outData += ostrides[2];
-            inData  += istrides[2];
-        }
-    }
+    getQueue().enqueue(kernel::morph<T, isDilation>, out, in, mask);
 
     return out;
 }
@@ -92,66 +38,12 @@ Array<T> morph(const Array<T> &in, const Array<T> &mask)
 template<typename T, bool isDilation>
 Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
 {
-    const dim4 dims       = in.dims();
-    const dim4 window     = mask.dims();
-    const dim_t R0     = window[0]/2;
-    const dim_t R1     = window[1]/2;
-    const dim_t R2     = window[2]/2;
-    const dim4 istrides   = in.strides();
-    const dim4 fstrides   = mask.strides();
-    const dim_t bCount = dims[3];
-
-    Array<T> out         = createEmptyArray<T>(dims);
-    const dim4 ostrides   = out.strides();
-
-    T* outData            = out.get();
-    const T*   inData     = in.get();
-    const T*   filter     = mask.get();
-
-    for(dim_t batchId=0; batchId<bCount; ++batchId) {
-        // either channels or batch is handled by outer most loop
-        for(dim_t k=0; k<dims[2]; ++k) {
-            // k steps along 3rd dimension
-            for(dim_t j=0; j<dims[1]; ++j) {
-                // j steps along 2nd dimension
-                for(dim_t i=0; i<dims[0]; ++i) {
-                    // i steps along 1st dimension
-                    T filterResult = inData[ getIdx(istrides, i, j, k) ];
-
-                    // wk, wj,wi steps along 2nd & 1st dimensions of filter window respectively
-                    for(dim_t wk=0; wk<window[2]; wk++) {
-                        for(dim_t wj=0; wj<window[1]; wj++) {
-                            for(dim_t wi=0; wi<window[0]; wi++) {
-
-                                dim_t offk = k+wk-R2;
-                                dim_t offj = j+wj-R1;
-                                dim_t offi = i+wi-R0;
-
-                                T maskValue = filter[ getIdx(fstrides, wi, wj, wk) ];
-
-                                if ((maskValue > (T)0) && offi>=0 && offj>=0 && offk>=0 &&
-                                        offi<dims[0] && offj<dims[1] && offk<dims[2]) {
-
-                                    T inValue   = inData[ getIdx(istrides, offi, offj, offk) ];
-
-                                    if (isDilation)
-                                        filterResult = std::max(filterResult, inValue);
-                                    else
-                                        filterResult = std::min(filterResult, inValue);
-                                }
+    in.eval();
+    mask.eval();
 
-                            } // window 1st dimension loop ends here
-                        }  // window 1st dimension loop ends here
-                    }// filter window loop ends here
+    Array<T> out = createEmptyArray<T>(in.dims());
 
-                    outData[ getIdx(ostrides, i, j, k) ] = filterResult;
-                } //1st dimension loop ends here
-            } // 2nd dimension loop ends here
-        } // 3rd dimension loop ends here
-        // next iteration will be next batch if any
-        outData += ostrides[3];
-        inData  += istrides[3];
-    }
+    getQueue().enqueue(kernel::morph3d<T, isDilation>, out, in, mask);
 
     return out;
 }
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index 79d41516e3..17e892f492 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -11,157 +11,50 @@
 #include <af/defines.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/nearest_neighbour.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-#if defined(_WIN32) || defined(_MSC_VER)
-
-#include <intrin.h>
-#define __builtin_popcount __popcnt
-
-#endif
-
-template<typename T, typename To, af_match_type dist_type>
-struct dist_op
-{
-    To operator()(T v1, T v2)
-    {
-        return v1 - v2;     // Garbage distance
-    }
-};
-
 template<typename T, typename To>
-struct dist_op<T, To, AF_SAD>
-{
-    To operator()(T v1, T v2)
-    {
-        return std::abs((double)v1 - (double)v2);
-    }
-};
-
-template<typename T, typename To>
-struct dist_op<T, To, AF_SSD>
-{
-    To operator()(T v1, T v2)
-    {
-        return (v1 - v2) * (v1 - v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uint, To, AF_SHD>
-{
-    To operator()(uint v1, uint v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uintl, To, AF_SHD>
-{
-    To operator()(uintl v1, uintl v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uchar, To, AF_SHD>
-{
-    To operator()(uchar v1, uchar v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<ushort, To, AF_SHD>
-{
-    To operator()(ushort v1, ushort v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename T, typename To, af_match_type dist_type>
-void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
-                        const Array<T>& query, const Array<T>& train,
-                        const uint dist_dim, const uint n_dist)
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type)
 {
-    uint sample_dim = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 tDims = train.dims();
-
     if (n_dist > 1) {
         CPU_NOT_SUPPORTED();
     }
 
-    const unsigned distLength = qDims[dist_dim];
-    const unsigned nQuery = qDims[sample_dim];
-    const unsigned nTrain = tDims[sample_dim];
+    idx.eval();
+    dist.eval();
+    query.eval();
+    train.eval();
 
-    const dim4 outDims(n_dist, nQuery);
+    uint sample_dim  = (dist_dim == 0) ? 1 : 0;
+    const dim4 qDims = query.dims();
+    const dim4 outDims(n_dist, qDims[sample_dim]);
 
     idx  = createEmptyArray<uint>(outDims);
     dist = createEmptyArray<To  >(outDims);
 
-    const T* qPtr = query.get();
-    const T* tPtr = train.get();
-    uint* iPtr = idx.get();
-    To* dPtr = dist.get();
-
-    dist_op<T, To, dist_type> op;
-
-    for (unsigned i = 0; i < nQuery; i++) {
-        To best_dist = limit_max<To>();
-        unsigned best_idx  = 0;
-
-        for (unsigned j = 0; j < nTrain; j++) {
-            To local_dist = 0;
-            for (unsigned k = 0; k < distLength; k++) {
-                size_t qIdx, tIdx;
-                if (sample_dim == 0) {
-                    qIdx = k * qDims[0] + i;
-                    tIdx = k * tDims[0] + j;
-                }
-                else {
-                    qIdx = i * qDims[0] + k;
-                    tIdx = j * tDims[0] + k;
-                }
-
-                local_dist += op(qPtr[qIdx], tPtr[tIdx]);
-            }
-
-            if (local_dist < best_dist) {
-                best_dist = local_dist;
-                best_idx  = j;
-            }
-        }
-
-        size_t oIdx;
-        oIdx = i;
-        iPtr[oIdx] = best_idx;
-        dPtr[oIdx] = best_dist;
-    }
-}
-
-template<typename T, typename To>
-void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
-                       const Array<T>& query, const Array<T>& train,
-                       const uint dist_dim, const uint n_dist,
-                       const af_match_type dist_type)
-{
     switch(dist_type) {
-        case AF_SAD: nearest_neighbour_<T, To, AF_SAD>(idx, dist, query, train, dist_dim, n_dist); break;
-        case AF_SSD: nearest_neighbour_<T, To, AF_SSD>(idx, dist, query, train, dist_dim, n_dist); break;
-        case AF_SHD: nearest_neighbour_<T, To, AF_SHD>(idx, dist, query, train, dist_dim, n_dist); break;
-        default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+        case AF_SAD:
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
+            break;
+        case AF_SSD:
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
+            break;
+        case AF_SHD:
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
+            break;
+        default:
+            AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
     }
 }
 
diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp
index d279ba514f..8bbfd41932 100644
--- a/src/backend/cpu/orb.cpp
+++ b/src/backend/cpu/orb.cpp
@@ -11,7 +11,6 @@
 #include <af/defines.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
 #include <resize.hpp>
 #include <fast.hpp>
@@ -19,520 +18,15 @@
 #include <convolve.hpp>
 #include <memory.hpp>
 #include <cstring>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/orb.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-static const float PI_VAL = 3.14159265358979323846f;
-
-// Reference pattern, generated for a patch size of 31x31, as suggested by
-// original ORB paper
-#define REF_PAT_SIZE 31
-#define REF_PAT_SAMPLES 256
-#define REF_PAT_COORDS 4
-#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS)
-
-// Current reference pattern was borrowed from OpenCV, to build a pattern with
-// similar quality, a training process must be applied, as described in
-// sections 4.2 and 4.3 of the original ORB paper.
-const int ref_pat[REF_PAT_LENGTH] = {
-    8,-3, 9,5,
-    4,2, 7,-12,
-    -11,9, -8,2,
-    7,-12, 12,-13,
-    2,-13, 2,12,
-    1,-7, 1,6,
-    -2,-10, -2,-4,
-    -13,-13, -11,-8,
-    -13,-3, -12,-9,
-    10,4, 11,9,
-    -13,-8, -8,-9,
-    -11,7, -9,12,
-    7,7, 12,6,
-    -4,-5, -3,0,
-    -13,2, -12,-3,
-    -9,0, -7,5,
-    12,-6, 12,-1,
-    -3,6, -2,12,
-    -6,-13, -4,-8,
-    11,-13, 12,-8,
-    4,7, 5,1,
-    5,-3, 10,-3,
-    3,-7, 6,12,
-    -8,-7, -6,-2,
-    -2,11, -1,-10,
-    -13,12, -8,10,
-    -7,3, -5,-3,
-    -4,2, -3,7,
-    -10,-12, -6,11,
-    5,-12, 6,-7,
-    5,-6, 7,-1,
-    1,0, 4,-5,
-    9,11, 11,-13,
-    4,7, 4,12,
-    2,-1, 4,4,
-    -4,-12, -2,7,
-    -8,-5, -7,-10,
-    4,11, 9,12,
-    0,-8, 1,-13,
-    -13,-2, -8,2,
-    -3,-2, -2,3,
-    -6,9, -4,-9,
-    8,12, 10,7,
-    0,9, 1,3,
-    7,-5, 11,-10,
-    -13,-6, -11,0,
-    10,7, 12,1,
-    -6,-3, -6,12,
-    10,-9, 12,-4,
-    -13,8, -8,-12,
-    -13,0, -8,-4,
-    3,3, 7,8,
-    5,7, 10,-7,
-    -1,7, 1,-12,
-    3,-10, 5,6,
-    2,-4, 3,-10,
-    -13,0, -13,5,
-    -13,-7, -12,12,
-    -13,3, -11,8,
-    -7,12, -4,7,
-    6,-10, 12,8,
-    -9,-1, -7,-6,
-    -2,-5, 0,12,
-    -12,5, -7,5,
-    3,-10, 8,-13,
-    -7,-7, -4,5,
-    -3,-2, -1,-7,
-    2,9, 5,-11,
-    -11,-13, -5,-13,
-    -1,6, 0,-1,
-    5,-3, 5,2,
-    -4,-13, -4,12,
-    -9,-6, -9,6,
-    -12,-10, -8,-4,
-    10,2, 12,-3,
-    7,12, 12,12,
-    -7,-13, -6,5,
-    -4,9, -3,4,
-    7,-1, 12,2,
-    -7,6, -5,1,
-    -13,11, -12,5,
-    -3,7, -2,-6,
-    7,-8, 12,-7,
-    -13,-7, -11,-12,
-    1,-3, 12,12,
-    2,-6, 3,0,
-    -4,3, -2,-13,
-    -1,-13, 1,9,
-    7,1, 8,-6,
-    1,-1, 3,12,
-    9,1, 12,6,
-    -1,-9, -1,3,
-    -13,-13, -10,5,
-    7,7, 10,12,
-    12,-5, 12,9,
-    6,3, 7,11,
-    5,-13, 6,10,
-    2,-12, 2,3,
-    3,8, 4,-6,
-    2,6, 12,-13,
-    9,-12, 10,3,
-    -8,4, -7,9,
-    -11,12, -4,-6,
-    1,12, 2,-8,
-    6,-9, 7,-4,
-    2,3, 3,-2,
-    6,3, 11,0,
-    3,-3, 8,-8,
-    7,8, 9,3,
-    -11,-5, -6,-4,
-    -10,11, -5,10,
-    -5,-8, -3,12,
-    -10,5, -9,0,
-    8,-1, 12,-6,
-    4,-6, 6,-11,
-    -10,12, -8,7,
-    4,-2, 6,7,
-    -2,0, -2,12,
-    -5,-8, -5,2,
-    7,-6, 10,12,
-    -9,-13, -8,-8,
-    -5,-13, -5,-2,
-    8,-8, 9,-13,
-    -9,-11, -9,0,
-    1,-8, 1,-2,
-    7,-4, 9,1,
-    -2,1, -1,-4,
-    11,-6, 12,-11,
-    -12,-9, -6,4,
-    3,7, 7,12,
-    5,5, 10,8,
-    0,-4, 2,8,
-    -9,12, -5,-13,
-    0,7, 2,12,
-    -1,2, 1,7,
-    5,11, 7,-9,
-    3,5, 6,-8,
-    -13,-4, -8,9,
-    -5,9, -3,-3,
-    -4,-7, -3,-12,
-    6,5, 8,0,
-    -7,6, -6,12,
-    -13,6, -5,-2,
-    1,-10, 3,10,
-    4,1, 8,-4,
-    -2,-2, 2,-13,
-    2,-12, 12,12,
-    -2,-13, 0,-6,
-    4,1, 9,3,
-    -6,-10, -3,-5,
-    -3,-13, -1,1,
-    7,5, 12,-11,
-    4,-2, 5,-7,
-    -13,9, -9,-5,
-    7,1, 8,6,
-    7,-8, 7,6,
-    -7,-4, -7,1,
-    -8,11, -7,-8,
-    -13,6, -12,-8,
-    2,4, 3,9,
-    10,-5, 12,3,
-    -6,-5, -6,7,
-    8,-3, 9,-8,
-    2,-12, 2,8,
-    -11,-2, -10,3,
-    -12,-13, -7,-9,
-    -11,0, -10,-5,
-    5,-3, 11,8,
-    -2,-13, -1,12,
-    -1,-8, 0,9,
-    -13,-11, -12,-5,
-    -10,-2, -10,11,
-    -3,9, -2,-13,
-    2,-3, 3,2,
-    -9,-13, -4,0,
-    -4,6, -3,-10,
-    -4,12, -2,-7,
-    -6,-11, -4,9,
-    6,-3, 6,11,
-    -13,11, -5,5,
-    11,11, 12,6,
-    7,-5, 12,-2,
-    -1,12, 0,7,
-    -4,-8, -3,-2,
-    -7,1, -6,7,
-    -13,-12, -8,-13,
-    -7,-2, -6,-8,
-    -8,5, -6,-9,
-    -5,-1, -4,5,
-    -13,7, -8,10,
-    1,5, 5,-13,
-    1,0, 10,-13,
-    9,12, 10,-1,
-    5,-8, 10,-9,
-    -1,11, 1,-13,
-    -9,-3, -6,2,
-    -1,-10, 1,12,
-    -13,1, -8,-10,
-    8,-11, 10,-6,
-    2,-13, 3,-6,
-    7,-13, 12,-9,
-    -10,-10, -5,-7,
-    -10,-8, -8,-13,
-    4,-6, 8,5,
-    3,12, 8,-13,
-    -4,2, -3,-3,
-    5,-13, 10,-12,
-    4,-13, 5,-1,
-    -9,9, -4,3,
-    0,3, 3,-9,
-    -12,1, -6,1,
-    3,2, 4,-8,
-    -10,-10, -10,9,
-    8,-13, 12,12,
-    -8,-12, -6,-5,
-    2,2, 3,7,
-    10,6, 11,-8,
-    6,8, 8,-12,
-    -7,10, -6,5,
-    -3,-9, -3,9,
-    -1,-13, -1,5,
-    -3,-7, -3,4,
-    -8,-2, -8,3,
-    4,2, 12,12,
-    2,-5, 3,11,
-    6,-9, 11,-13,
-    3,-1, 7,12,
-    11,-1, 12,4,
-    -3,0, -3,6,
-    4,-11, 4,12,
-    2,-4, 2,1,
-    -10,-6, -8,1,
-    -13,7, -11,1,
-    -13,12, -11,-13,
-    6,0, 11,-13,
-    0,-1, 1,4,
-    -13,3, -9,-2,
-    -9,8, -6,-3,
-    -13,-6, -8,-2,
-    5,-9, 8,10,
-    2,7, 3,-9,
-    -1,-6, -1,-1,
-    9,5, 11,-2,
-    11,-3, 12,-8,
-    3,0, 3,5,
-    -1,4, 0,10,
-    3,-6, 4,5,
-    -13,0, -10,5,
-    5,8, 12,11,
-    8,9, 9,-6,
-    7,-4, 8,-12,
-    -10,4, -10,9,
-    7,3, 12,4,
-    9,-7, 10,-2,
-    7,0, 12,-2,
-    -1,-6, 0,-11,
-};
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-void keep_features(
-    float* x_out,
-    float* y_out,
-    float* score_out,
-    float* size_out,
-    const float* x_in,
-    const float* y_in,
-    const float* score_in,
-    const unsigned* score_idx,
-    const float* size_in,
-    const unsigned n_feat)
-{
-    // Keep only the first n_feat features
-    for (unsigned f = 0; f < n_feat; f++) {
-        x_out[f] = x_in[score_idx[f]];
-        y_out[f] = y_in[score_idx[f]];
-        score_out[f] = score_in[f];
-        if (size_in != nullptr && size_out != nullptr)
-            size_out[f] = size_in[score_idx[f]];
-    }
-}
-
-template<typename T, bool use_scl>
-void harris_response(
-    float* x_out,
-    float* y_out,
-    float* score_out,
-    float* size_out,
-    const float* x_in,
-    const float* y_in,
-    const float* scl_in,
-    const unsigned total_feat,
-    unsigned* usable_feat,
-    const Array<T>& image,
-    const unsigned block_size,
-    const float k_thr,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    for (unsigned f = 0; f < total_feat; f++) {
-        unsigned x, y;
-        float scl = 1.f;
-        if (use_scl) {
-            // Update x and y coordinates according to scale
-            scl = scl_in[f];
-            x = (unsigned)round(x_in[f] * scl);
-            y = (unsigned)round(y_in[f] * scl);
-        }
-        else {
-            x = (unsigned)round(x_in[f]);
-            y = (unsigned)round(y_in[f]);
-        }
-
-        // Round feature size to nearest odd integer
-        float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
-
-        // Avoid keeping features that might be too wide and might not fit on
-        // the image, sqrt(2.f) is the radius when angle is 45 degrees and
-        // represents widest case possible
-        unsigned patch_r = ceil(size * sqrt(2.f) / 2.f);
-        if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r)
-            continue;
-
-        unsigned r = block_size / 2;
-
-        float ixx = 0.f, iyy = 0.f, ixy = 0.f;
-        unsigned block_size_sq = block_size * block_size;
-        for (unsigned k = 0; k < block_size_sq; k++) {
-            int i = k / block_size - r;
-            int j = k % block_size - r;
-
-            // Calculate local x and y derivatives
-            float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j];
-            float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1];
-
-            // Accumulate second order derivatives
-            ixx += ix*ix;
-            iyy += iy*iy;
-            ixy += ix*iy;
-        }
-
-        unsigned idx = *usable_feat;
-        *usable_feat += 1;
-        float tr = ixx + iyy;
-        float det = ixx*iyy - ixy*ixy;
-
-        // Calculate Harris responses
-        float resp = det - k_thr * (tr*tr);
-
-        // Scale factor
-        // TODO: improve response scaling
-        float rscale = 0.001f;
-        rscale = rscale * rscale * rscale * rscale;
-
-        x_out[idx] = x;
-        y_out[idx] = y;
-        score_out[idx] = resp * rscale;
-        if (use_scl)
-            size_out[idx] = size;
-    }
-}
-
-template<typename T>
-void centroid_angle(
-    const float* x_in,
-    const float* y_in,
-    float* orientation_out,
-    const unsigned total_feat,
-    const Array<T>& image,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    for (unsigned f = 0; f < total_feat; f++) {
-        unsigned x = (unsigned)round(x_in[f]);
-        unsigned y = (unsigned)round(y_in[f]);
-
-        unsigned r = patch_size / 2;
-        if (x < r || y < r || x > idims[1] - r || y > idims[0] - r)
-            continue;
-
-        T m01 = (T)0, m10 = (T)0;
-        unsigned patch_size_sq = patch_size * patch_size;
-        for (unsigned k = 0; k < patch_size_sq; k++) {
-            int i = k / patch_size - r;
-            int j = k % patch_size - r;
-
-            // Calculate first order moments
-            T p = image_ptr[(x+i) * idims[0] + y+j];
-            m01 += j * p;
-            m10 += i * p;
-        }
-
-        float angle = atan2(m01, m10);
-        orientation_out[f] = angle;
-    }
-}
-
-template<typename T>
-inline T get_pixel(
-    unsigned x,
-    unsigned y,
-    const float ori,
-    const unsigned size,
-    const int dist_x,
-    const int dist_y,
-    const Array<T>& image,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    float ori_sin = sin(ori);
-    float ori_cos = cos(ori);
-    float patch_scl = (float)size / (float)patch_size;
-
-    // Calculate point coordinates based on orientation and size
-    x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
-    y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
-
-    return image_ptr[x * idims[0] + y];
-}
-
-template<typename T>
-void extract_orb(
-    unsigned* desc_out,
-    const unsigned n_feat,
-    float* x_in_out,
-    float* y_in_out,
-    const float* ori_in,
-    float* size_out,
-    const Array<T>& image,
-    const float scl,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    for (unsigned f = 0; f < n_feat; f++) {
-        unsigned x = (unsigned)round(x_in_out[f]);
-        unsigned y = (unsigned)round(y_in_out[f]);
-        float ori = ori_in[f];
-        unsigned size = patch_size;
-
-        unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
-        if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r)
-            continue;
-
-        // Descriptor fixed at 256 bits for now
-        // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
-        for (unsigned i = 0; i < 8; i++) {
-            unsigned v = 0;
-
-            // j < 32 for 256 bits descriptor
-            for (unsigned j = 0; j < 32; j++) {
-                // Get position from distribution pattern and values of points p1 and p2
-                int dist_x = ref_pat[i*32*4 + j*4];
-                int dist_y = ref_pat[i*32*4 + j*4+1];
-                T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
-                dist_x = ref_pat[i*32*4 + j*4+2];
-                dist_y = ref_pat[i*32*4 + j*4+3];
-                T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
-                // Calculate bit based on p1 and p2 and shifts it to correct position
-                v |= (p1 < p2) << j;
-            }
-
-            // Store 32 bits of descriptor
-            desc_out[f * 8 + i] += v;
-        }
-
-        x_in_out[f] = round(x * scl);
-        y_in_out[f] = round(y * scl);
-        size_out[f] = patch_size * scl;
-    }
-}
-
-
-
 template<typename T, typename convAccT>
 unsigned orb(Array<float> &x, Array<float> &y,
              Array<float> &score, Array<float> &ori,
@@ -542,6 +36,8 @@ unsigned orb(Array<float> &x, Array<float> &y,
              const float scl_fctr, const unsigned levels,
              const bool blur_img)
 {
+    image.eval();
+    getQueue().sync();
 
     unsigned patch_size = REF_PAT_SIZE;
 
@@ -611,6 +107,9 @@ unsigned orb(Array<float> &x, Array<float> &y,
             prev_img = lvl_img;
             prev_ldims = lvl_img.dims();
         }
+        prev_img.eval();
+        lvl_img.eval();
+        getQueue().sync();
 
 
         Array<float> x_feat = createEmptyArray<float>(dim4());
@@ -628,7 +127,6 @@ unsigned orb(Array<float> &x, Array<float> &y,
         unsigned lvl_feat = fast(x_feat, y_feat, score_feat,
                                  lvl_img, fast_thr, 9, 1, 0.15f, edge);
 
-
         if (lvl_feat == 0) {
             continue;
         }
@@ -643,7 +141,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         // Calculate Harris responses
         // Good block_size >= 7 (must be an odd number)
         unsigned usable_feat = 0;
-        harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
+        kernel::harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
                                   h_x_feat, h_y_feat, nullptr,
                                   lvl_feat, &usable_feat,
                                   lvl_img,
@@ -653,7 +151,6 @@ unsigned orb(Array<float> &x, Array<float> &y,
             memFree(h_x_harris);
             memFree(h_y_harris);
             memFree(h_score_harris);
-
             continue;
         }
 
@@ -664,13 +161,13 @@ unsigned orb(Array<float> &x, Array<float> &y,
         Array<unsigned> harris_idx = createEmptyArray<unsigned>(af::dim4());
 
         sort_index<float, false>(harris_sorted, harris_idx, score_harris, 0);
+        getQueue().sync();
 
         usable_feat = std::min(usable_feat, lvl_best[i]);
 
         if (usable_feat == 0) {
             memFree(h_x_harris);
             memFree(h_y_harris);
-
             continue;
         }
 
@@ -679,7 +176,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         float* h_score_lvl = memAlloc<float>(usable_feat);
 
         // Keep only features with higher Harris responses
-        keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
+        kernel::keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
                          h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(),
                          nullptr, usable_feat);
 
@@ -690,7 +187,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         float* h_size_lvl = memAlloc<float>(usable_feat);
 
         // Compute orientation of features
-        centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
+        kernel::centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
                           lvl_img, patch_size);
 
         Array<T> lvl_filt = createEmptyArray<T>(dim4());
@@ -701,21 +198,24 @@ unsigned orb(Array<float> &x, Array<float> &y,
                 h_gauss = memAlloc<T>(gauss_dims[0]);
                 gaussian1D(h_gauss, gauss_dims[0], 2.f);
                 gauss_filter = createDeviceDataArray<T>(gauss_dims, h_gauss);
+                gauss_filter.eval();
             }
 
             // Filter level image with Gaussian kernel to reduce noise sensitivity
             lvl_filt = convolve2<T, convAccT, false>(lvl_img, gauss_filter, gauss_filter);
         }
+        lvl_filt.eval();
+        getQueue().sync();
 
         // Compute ORB descriptors
         unsigned* h_desc_lvl = memAlloc<unsigned>(usable_feat * 8);
         memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned));
         if (blur_img)
-            extract_orb<T>(h_desc_lvl, usable_feat,
+            kernel::extract_orb<T>(h_desc_lvl, usable_feat,
                            h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
                            lvl_filt, lvl_scl, patch_size);
         else
-            extract_orb<T>(h_desc_lvl, usable_feat,
+            kernel::extract_orb<T>(h_desc_lvl, usable_feat,
                            h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
                            lvl_img, lvl_scl, patch_size);
 
diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp
index 4d96a37fbb..9474c792f3 100644
--- a/src/backend/cpu/platform.cpp
+++ b/src/backend/cpu/platform.cpp
@@ -11,11 +11,16 @@
 #include <af/defines.h>
 #include <platform.hpp>
 #include <sstream>
+#include <queue.hpp>
+#include <array>
 #include <algorithm>
 #include <iostream>
 #include <string>
 #include <defines.hpp>
 #include <version.hpp>
+#include <queue.hpp>
+#include <host_memory.hpp>
+#include <cctype>
 
 #ifdef _WIN32
 #include <limits.h>
@@ -175,6 +180,22 @@ CPUInfo::CPUInfo()
 namespace cpu
 {
 
+// Returns the limit configured through AF_CPU_MAX_JIT_LEN when that holds a
+// positive integer, otherwise 20. Resolved on first call, then cached.
+unsigned getMaxJitSize()
+{
+    const int MAX_JIT_LEN = 20;
+    static int length = 0;  // 0 = not resolved yet
+    if (length <= 0) {
+        std::string env_var = getEnvVar("AF_CPU_MAX_JIT_LEN");
+        int requested = env_var.empty() ? 0 : std::stoi(env_var);
+        // Zero/negative requests fall back to the default instead of being
+        // returned as 0 or wrapping to a huge value in the unsigned return.
+        length = (requested > 0) ? requested : MAX_JIT_LEN;
+    }
+    return length;
+}
+
 int getBackend()
 {
     return AF_BACKEND_CPU;
@@ -194,14 +215,29 @@ static const std::string get_system(void)
 #endif
 }
 
-std::string getInfo()
+// http://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring/217605#217605
+// Strips leading whitespace from s in place and returns s; isspace needs an unsigned char value.
+static inline std::string &ltrim(std::string &s)
+{
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+        [](char c) { return !std::isspace(static_cast<unsigned char>(c)); }));
+    return s;
+}
+
+std::string getDeviceInfo()
 {
     std::ostringstream info;
     static CPUInfo cinfo;
 
     info << "ArrayFire v" << AF_VERSION
          << " (CPU, " << get_system() << ", build " << AF_REVISION << ")" << std::endl;
-    info << string("[0] ") << cinfo.vendor() <<": " << cinfo.model() << " ";
+    std::string model = cinfo.model();
+    size_t memMB = getDeviceMemorySize(getActiveDeviceId()) / 1048576;
+    info << string("[0] ") << cinfo.vendor() <<": " << ltrim(model);
+
+    if(memMB) info << ", " << memMB << " MB, ";
+    else      info << ", Unknown MB, ";
+
     info << "Max threads("<< cinfo.threads()<<") ";
 #ifndef NDEBUG
     info << AF_COMPILER_STR;
@@ -234,11 +270,11 @@ int getDeviceCount()
 int setDevice(int device)
 {
     static bool flag;
-    if(!flag) {
-        printf("WARNING: af_set_device not supported for CPU\n");
+    if(!flag && device != 0) {
+        printf("WARNING af_set_device(device): device can only be 0 for CPU\n");
         flag = 1;
     }
-    return 1;
+    return 0;
 }
 
 int getActiveDeviceId()
@@ -246,9 +282,27 @@ int getActiveDeviceId()
     return 0;
 }
 
+size_t getDeviceMemorySize(int device)
+{
+    return common::getHostMemorySize();  // device is not consulted; the host memory size is reported
+}
+
+size_t getHostMemorySize()
+{
+    return common::getHostMemorySize();  // forwards to the common:: implementation
+}
+
+static const int MAX_QUEUES = 1;
+
+// Returns entry idx of a function-local static array of MAX_QUEUES queues; idx is not range-checked.
+queue& getQueue(int idx) {
+    static std::array<queue, MAX_QUEUES> queues;
+    return queues[idx];
+}
+
 void sync(int device)
 {
-    // Nothing here
+    getQueue().sync();
 }
 
 }
diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp
index 2e52cd13a6..7caddccc72 100644
--- a/src/backend/cpu/platform.hpp
+++ b/src/backend/cpu/platform.hpp
@@ -7,12 +7,16 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+
 #include <string>
 
 namespace cpu {
+    class queue;
+
     int getBackend();
 
-    std::string getInfo();
+    std::string getDeviceInfo();
 
     bool isDoubleSupported(int device);
 
@@ -24,5 +28,13 @@ namespace cpu {
 
     int getActiveDeviceId();
 
+    size_t getDeviceMemorySize(int device);
+
+    size_t getHostMemorySize();
+
     void sync(int device);
+
+    queue& getQueue(int idx = 0);
+
+    unsigned getMaxJitSize();
 }
diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp
index 9de1993f2d..2ab69643c8 100644
--- a/src/backend/cpu/plot.cpp
+++ b/src/backend/cpu/plot.cpp
@@ -12,37 +12,40 @@
 #include <Array.hpp>
 #include <plot.hpp>
 #include <err_cpu.hpp>
-#include <stdexcept>
 #include <graphics_common.hpp>
-#include <reduce.hpp>
-#include <memory.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    void copy_plot(const Array<T> &P, fg::Plot* plot)
-    {
-        CheckGL("Before CopyArrayToVBO");
-
-        glBindBuffer(GL_ARRAY_BUFFER, plot->vbo());
-        glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get());
-        glBindBuffer(GL_ARRAY_BUFFER, 0);
-
-        CheckGL("In CopyArrayToVBO");
-    }
-
-    #define INSTANTIATE(T)  \
-        template void copy_plot<T>(const Array<T> &P, fg::Plot* plot);
-
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+// Copies plot->size() bytes from P.get() into the buffer bound from plot->vbo().
+template<typename T>
+void copy_plot(const Array<T> &P, fg::Plot* plot)
+{
+    P.eval();
+    getQueue().sync();  // presumably waits for queued work so P.get() is safe to read below — confirm
+    CheckGL("Before CopyArrayToVBO");
+
+    glBindBuffer(GL_ARRAY_BUFFER, plot->vbo());
+    glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get());
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    CheckGL("In CopyArrayToVBO");
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_plot<T>(const Array<T> &P, fg::Plot* plot);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp
index c0e26aaa34..515fe0336c 100644
--- a/src/backend/cpu/plot3.cpp
+++ b/src/backend/cpu/plot3.cpp
@@ -12,37 +12,40 @@
 #include <Array.hpp>
 #include <plot3.hpp>
 #include <err_cpu.hpp>
-#include <stdexcept>
 #include <graphics_common.hpp>
-#include <reduce.hpp>
-#include <memory.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    void copy_plot3(const Array<T> &P, fg::Plot3* plot3)
-    {
-        CheckGL("Before CopyArrayToVBO");
-
-        glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo());
-        glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get());
-        glBindBuffer(GL_ARRAY_BUFFER, 0);
-
-        CheckGL("In CopyArrayToVBO");
-    }
-
-    #define INSTANTIATE(T)  \
-        template void copy_plot3<T>(const Array<T> &P, fg::Plot3* plot3);
-
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+// Copies plot3->size() bytes from P.get() into the buffer bound from plot3->vbo().
+template<typename T>
+void copy_plot3(const Array<T> &P, fg::Plot3* plot3)
+{
+    P.eval();
+    getQueue().sync();  // presumably waits for queued work so P.get() is safe to read below — confirm
+    CheckGL("Before CopyArrayToVBO");
+
+    glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo());
+    glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get());
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    CheckGL("In CopyArrayToVBO");
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_plot3<T>(const Array<T> &P, fg::Plot3* plot3);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp
index d1c3e233af..f8dbfa2013 100644
--- a/src/backend/cpu/qr.cpp
+++ b/src/backend/cpu/qr.cpp
@@ -11,28 +11,23 @@
 #include <err_common.hpp>
 
 #if defined(WITH_CPU_LINEAR_ALGEBRA)
-
 #include <af/dim4.hpp>
 #include <handle.hpp>
-#include <iostream>
 #include <cassert>
 #include <err_cpu.hpp>
 #include <triangle.hpp>
-
 #include <lapack_helper.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
 
 template<typename T>
-using geqrf_func_def = int (*)(ORDER_TYPE, int, int,
-                               T*, int,
-                               T*);
+using geqrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, T*);
 
 template<typename T>
-using gqr_func_def = int (*)(ORDER_TYPE, int, int, int,
-                             T*, int,
-                             const T*);
+using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, T*, int, const T*);
 
 #define QR_FUNC_DEF( FUNC )                                         \
 template<typename T> FUNC##_func_def<T> FUNC##_func();
@@ -64,9 +59,14 @@ GQR_FUNC(gqr , cdouble, zungqr)
 template<typename T>
 void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
 {
+    q.eval();
+    r.eval();
+    t.eval();
+    in.eval();
+
     dim4 iDims = in.dims();
-    int M = iDims[0];
-    int N = iDims[1];
+    int M      = iDims[0];
+    int N      = iDims[1];
 
     q = padArray<T, T>(in, dim4(M, max(M, N)));
     q.resetDims(iDims);
@@ -78,39 +78,31 @@ void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
 
     triangle<T, true, false>(r, q);
 
-    gqr_func<T>()(AF_LAPACK_COL_MAJOR,
-                  M, M, min(M, N),
-                  q.get(), q.strides()[1],
-                  t.get());
-
+    auto func = [=] (Array<T> q, Array<T> t, int M, int N) {
+        gqr_func<T>()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), q.get(), q.strides()[1], t.get());
+    };
     q.resetDims(dim4(M, M));
+    getQueue().enqueue(func, q, t, M, N);
 }
 
 template<typename T>
 Array<T> qr_inplace(Array<T> &in)
 {
-    dim4 iDims = in.dims();
-    int M = iDims[0];
-    int N = iDims[1];
+    in.eval();
 
+    dim4 iDims = in.dims();
+    int M      = iDims[0];
+    int N      = iDims[1];
     Array<T> t = createEmptyArray<T>(af::dim4(min(M, N), 1, 1, 1));
 
-    geqrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
-                    in.get(), in.strides()[1],
-                    t.get());
+    auto func = [=] (Array<T> in, Array<T> t, int M, int N) {
+        geqrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N, in.get(), in.strides()[1], t.get());
+    };
+    getQueue().enqueue(func, in, t, M, N);
 
     return t;
 }
 
-#define INSTANTIATE_QR(T)                                                                           \
-    template Array<T> qr_inplace<T>(Array<T> &in);                                                \
-    template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
-
-INSTANTIATE_QR(float)
-INSTANTIATE_QR(cfloat)
-INSTANTIATE_QR(double)
-INSTANTIATE_QR(cdouble)
-
 }
 
 #else
@@ -130,6 +122,13 @@ Array<T> qr_inplace(Array<T> &in)
     AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
 }
 
+}
+
+#endif
+
+namespace cpu
+{
+
 #define INSTANTIATE_QR(T)                                                                           \
     template Array<T> qr_inplace<T>(Array<T> &in);                                                \
     template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
@@ -140,5 +139,3 @@ INSTANTIATE_QR(double)
 INSTANTIATE_QR(cdouble)
 
 }
-
-#endif
diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp
new file mode 100644
index 0000000000..2f32b4d852
--- /dev/null
+++ b/src/backend/cpu/queue.hpp
@@ -0,0 +1,93 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <util.hpp>
+#include <memory.hpp>
+
+//FIXME: Is there a better way to check for std::future not being supported ?
+#if defined(AF_DISABLE_CPU_ASYNC) || (defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2))
+
+#include <functional>
+using std::function;
+#include <err_cpu.hpp>
+#define __SYNCHRONOUS_ARCH 1
+class queue_impl  // takes the place of async_queue in this preprocessor branch
+{
+public:
+    template <typename F, typename... Args>
+    void enqueue(const F func, Args... args) const {
+        AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL);
+    }
+    // Every member raises AF_ERR_INTERNAL; cpu::queue sets sync_calls when __SYNCHRONOUS_ARCH == 1 and then bypasses aQueue.
+    void sync() const {
+        AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL);
+    }
+
+    bool is_worker() const {
+        AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL);
+        return false;
+    }
+
+};
+
+#else
+
+#include <async_queue.hpp>
+#define __SYNCHRONOUS_ARCH 0
+typedef async_queue queue_impl;
+
+#endif
+
+#pragma once
+
+namespace cpu {
+
+/// Front end over queue_impl: runs work inline when synchronous calls are
+/// forced (synchronous arch or AF_SYNCHRONOUS_CALLS=1), else forwards to it.
+class queue
+{
+public:
+    queue()
+        : count(0)
+        , sync_calls(__SYNCHRONOUS_ARCH == 1 || getEnvVar("AF_SYNCHRONOUS_CALLS") == "1")
+    {}
+
+    /// Runs or queues func(args...). Debug builds sync after every call;
+    /// release builds sync on checkMemoryLimit() or 25 enqueues since a sync.
+    template <typename F, typename... Args>
+    void enqueue(const F func, Args... args)
+    {
+        ++count;
+        if (!sync_calls) { aQueue.enqueue(func, args...); }
+        else             { func(args...); }
+#ifndef NDEBUG
+        sync();
+#else
+        if (checkMemoryLimit() || count >= 25) { sync(); }
+#endif
+    }
+
+    /// Resets the enqueue counter and, in queued mode, calls aQueue.sync().
+    void sync()
+    {
+        count = 0;
+        if (sync_calls) return;
+        aQueue.sync();
+    }
+
+    /// Always false in synchronous mode; otherwise defers to the impl.
+    bool is_worker() const { return !sync_calls && aQueue.is_worker(); }
+
+private:
+    int count;              // enqueues since the last sync()
+    const bool sync_calls;  // fixed at construction
+    queue_impl aQueue;
+};
+
+}
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp
index e93fdf9b8f..06cbca34d7 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/random.cpp
@@ -7,117 +7,23 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <type_traits>
-#include <random>
-#include <algorithm>
-#include <functional>
-#include <limits>
-#include <type_traits>
 #include <af/array.h>
 #include <af/dim4.hpp>
 #include <af/defines.h>
 #include <Array.hpp>
 #include <random.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/random.hpp>
 
 namespace cpu
 {
 
-using namespace std;
-
-template<typename T>
-using is_arithmetic_t       = typename enable_if< is_arithmetic<T>::value,      function<T()>>::type;
-template<typename T>
-using is_complex_t          = typename enable_if< is_complex<T>::value,         function<T()>>::type;
-template<typename T>
-using is_floating_point_t   = typename enable_if< is_floating_point<T>::value,  function<T()>>::type;
-
-template<typename T, typename GenType>
-is_arithmetic_t<T>
-urand(GenType &generator)
-{
-    typedef typename conditional<   is_floating_point<T>::value,
-                                    uniform_real_distribution<T>,
-#if OS_WIN
-                                    uniform_int_distribution<unsigned>>::type dist;
-#else
-                                    uniform_int_distribution<T >> ::type dist;
-#endif
-    return bind(dist(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-urand(GenType &generator)
-{
-    auto func = urand<typename T::value_type>(generator);
-    return [func] () { return T(func(), func());};
-}
-
-template<typename T, typename GenType>
-is_floating_point_t<T>
-nrand(GenType &generator)
-{
-    return bind(normal_distribution<T>(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-nrand(GenType &generator)
-{
-    auto func = nrand<typename T::value_type>(generator);
-    return [func] () { return T(func(), func());};
-}
-
-static mt19937 generator;
-static unsigned long long gen_seed = 0;
-static bool is_first = true;
-#define GLOBAL 1
-
-template<typename T>
-Array<T> randn(const af::dim4 &dims)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = nrand<T>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = nrand<T>(generator);
-        my_seed = gen_seed;
-    }
-
-    Array<T> outArray = createEmptyArray<T>(dims);
-    T *outPtr = outArray.get();
-    for (int i = 0; i < (int)outArray.elements(); i++) {
-        outPtr[i] = gen();
-    }
-    return outArray;
-}
-
 template<typename T>
 Array<T> randu(const af::dim4 &dims)
 {
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = urand<T>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = urand<T>(generator);
-        my_seed = gen_seed;
-    }
-
     Array<T> outArray = createEmptyArray<T>(dims);
-    T *outPtr = outArray.get();
-    for (int i = 0; i < (int)outArray.elements(); i++) {
-        outPtr[i] = gen();
-    }
+    getQueue().enqueue(kernel::randu<T>, outArray);
     return outArray;
 }
 
@@ -133,9 +39,18 @@ INSTANTIATE_UNIFORM(uint)
 INSTANTIATE_UNIFORM(intl)
 INSTANTIATE_UNIFORM(uintl)
 INSTANTIATE_UNIFORM(uchar)
+INSTANTIATE_UNIFORM(char)
 INSTANTIATE_UNIFORM(short)
 INSTANTIATE_UNIFORM(ushort)
 
+template<typename T>
+Array<T> randn(const af::dim4 &dims) // builds an Array<T> of shape `dims` and hands it to kernel::randn<T>
+{
+    Array<T> outArray = createEmptyArray<T>(dims); // NOTE(review): presumably uninitialized storage - confirm
+    getQueue().enqueue(kernel::randn<T>, outArray); // the fill is queued; no sync is issued in this function
+    return outArray; // returned right after enqueue - presumably readers must go through the same queue (confirm)
+}
+
 #define INSTANTIATE_NORMAL(T)                              \
     template Array<T>  randn<T>(const af::dim4 &dims);
 
@@ -144,41 +59,17 @@ INSTANTIATE_NORMAL(double)
 INSTANTIATE_NORMAL(cfloat)
 INSTANTIATE_NORMAL(cdouble)
 
-
-template<>
-Array<char> randu(const af::dim4 &dims)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = urand<float>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = urand<float>(generator);
-        my_seed = gen_seed;
-    }
-
-    Array<char> outArray = createEmptyArray<char>(dims);
-    char *outPtr = outArray.get();
-    for (int i = 0; i < (int)outArray.elements(); i++) {
-        outPtr[i] = gen() > 0.5;
-    }
-    return outArray;
-}
-
 void setSeed(const uintl seed)
 {
-    generator.seed(seed);
-    is_first = false;
-    gen_seed = seed;
+    getQueue().enqueue(kernel::setSeed, seed);
 }
 
 uintl getSeed()
 {
-    return gen_seed;
+    uintl seed = 0; // local result slot; its address is passed to the queued task below
+    getQueue().enqueue(kernel::getSeedPtr, &seed); // queued task receives a pointer to the stack local `seed`
+    getQueue().sync(); // NOTE(review): presumably blocks until the task has written `seed`; must precede the read and the return - confirm
+    return seed;
 }
 
 }
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index eabf3a1ee1..e91ba1e241 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -14,74 +14,46 @@
 #include <err_cpu.hpp>
 #include <algorithm>
 #include <numeric>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/range.hpp>
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, int dim>
-    void range(T *out, const dim4 &dims, const dim4 &strides)
-    {
-        for(dim_t w = 0; w < dims[3]; w++) {
-            dim_t offW = w * strides[3];
-            for(dim_t z = 0; z < dims[2]; z++) {
-                dim_t offWZ = offW + z * strides[2];
-                for(dim_t y = 0; y < dims[1]; y++) {
-                    dim_t offWZY = offWZ + y * strides[1];
-                    for(dim_t x = 0; x < dims[0]; x++) {
-                        dim_t id = offWZY + x;
-                        if(dim == 0) {
-                            out[id] = x;
-                        } else if(dim == 1) {
-                            out[id] = y;
-                        } else if(dim == 2) {
-                            out[id] = z;
-                        } else if(dim == 3) {
-                            out[id] = w;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T>
-    Array<T> range(const dim4& dims, const int seq_dim)
-    {
-        // Set dimension along which the sequence should be
-        // Other dimensions are simply tiled
-        int _seq_dim = seq_dim;
-        if(seq_dim < 0) {
-            _seq_dim = 0;   // column wise sequence
-        }
-
-        Array<T> out = createEmptyArray<T>(dims);
-        switch(_seq_dim) {
-            case 0: range<T, 0>(out.get(), out.dims(), out.strides()); break;
-            case 1: range<T, 1>(out.get(), out.dims(), out.strides()); break;
-            case 2: range<T, 2>(out.get(), out.dims(), out.strides()); break;
-            case 3: range<T, 3>(out.get(), out.dims(), out.strides()); break;
-            default : AF_ERROR("Invalid rep selection", AF_ERR_ARG);
-        }
 
+template<typename T>
+Array<T> range(const dim4& dims, const int seq_dim)
+{
+    // Select the dimension along which the sequence is generated;
+    // the remaining dimensions are simply tiled
+    int _seq_dim = seq_dim;
+    if(seq_dim < 0) {
+        _seq_dim = 0;   // column wise sequence
+    }
 
-        return out;
+    Array<T> out = createEmptyArray<T>(dims);
+    switch(_seq_dim) {
+        case 0: getQueue().enqueue(kernel::range<T, 0>, out); break;
+        case 1: getQueue().enqueue(kernel::range<T, 1>, out); break;
+        case 2: getQueue().enqueue(kernel::range<T, 2>, out); break;
+        case 3: getQueue().enqueue(kernel::range<T, 3>, out); break;
+        default : AF_ERROR("Invalid rep selection", AF_ERR_ARG);
     }
 
+    return out;
+}
+
 #define INSTANTIATE(T)                                                      \
     template Array<T> range<T>(const af::dim4 &dims, const int seq_dims);   \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
+
 }
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index a38d06118c..2d4d18e682 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -15,6 +15,9 @@
 #include <ops.hpp>
 #include <functional>
 #include <complex>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/reduce.hpp>
 
 using af::dim4;
 
@@ -34,217 +37,179 @@ struct Binary<cdouble, af_add_t>
 
 namespace cpu
 {
-    template<af_op_t op, typename Ti, typename To, int D>
-    struct reduce_dim
-    {
-        void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
-                        const Ti *in , const dim4 &istrides, const dim4 &idims,
-                        const int dim, bool change_nan, double nanval)
-        {
-            static const int D1 = D - 1;
-            static reduce_dim<op, Ti, To, D1> reduce_dim_next;
-            for (dim_t i = 0; i < odims[D1]; i++) {
-                reduce_dim_next(out + i * ostrides[D1],
-                                ostrides, odims,
-                                in  + i * istrides[D1],
-                                istrides, idims,
-                                dim, change_nan, nanval);
-            }
-        }
-    };
 
-    template<af_op_t op, typename Ti, typename To>
-    struct reduce_dim<op, Ti, To, 0>
-    {
+template<af_op_t op, typename Ti, typename To>
+using reduce_dim_func = std::function<void(Array<To>, const dim_t,
+                                           const Array<Ti>, const dim_t,
+                                           const int, bool, double)>;
 
-        Transform<Ti, To, op> transform;
-        Binary<To, op> reduce;
-        void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
-                        const Ti *in , const dim4 &istrides, const dim4 &idims,
-                        const int dim, bool change_nan, double nanval)
-        {
-            dim_t stride = istrides[dim];
-
-            To out_val = reduce.init();
-            for (dim_t i = 0; i < idims[dim]; i++) {
-                To in_val = transform(in[i * stride]);
-                if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
-                out_val = reduce(in_val, out_val);
-            }
-
-            *out = out_val;
-        }
-    };
-
-    template<af_op_t op, typename Ti, typename To>
-    using reduce_dim_func = std::function<void(To*,const dim4&, const dim4&,
-                                                const Ti*, const dim4&, const dim4&,
-                                                const int, bool, double)>;
+template<af_op_t op, typename Ti, typename To>
+Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
+{
+    dim4 odims = in.dims();
+    odims[dim] = 1;
+    in.eval();
 
-    template<af_op_t op, typename Ti, typename To>
-    Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
-    {
-        dim4 odims = in.dims();
-        odims[dim] = 1;
+    Array<To> out = createEmptyArray<To>(odims);
+    static const reduce_dim_func<op, Ti, To>  reduce_funcs[4] = { kernel::reduce_dim<op, Ti, To, 1>()
+                                                                , kernel::reduce_dim<op, Ti, To, 2>()
+                                                                , kernel::reduce_dim<op, Ti, To, 3>()
+                                                                , kernel::reduce_dim<op, Ti, To, 4>()};
 
-        Array<To> out = createEmptyArray<To>(odims);
-        static reduce_dim_func<op, Ti, To>  reduce_funcs[4] = { reduce_dim<op, Ti, To, 1>()
-                                                              , reduce_dim<op, Ti, To, 2>()
-                                                              , reduce_dim<op, Ti, To, 3>()
-                                                              , reduce_dim<op, Ti, To, 4>()};
+    getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval);
 
-        reduce_funcs[in.ndims() - 1](out.get(), out.strides(), out.dims(),
-                                     in.get(), in.strides(), in.dims(), dim,
-                                     change_nan, nanval);
+    return out;
+}
 
-        return out;
-    }
+template<af_op_t op, typename Ti, typename To>
+To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
+{
+    in.eval();
+    getQueue().sync();
 
-    template<af_op_t op, typename Ti, typename To>
-    To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
-    {
-        Transform<Ti, To, op> transform;
-        Binary<To, op> reduce;
+    Transform<Ti, To, op> transform;
+    Binary<To, op> reduce;
 
-        To out = reduce.init();
+    To out = reduce.init();
 
-        // Decrement dimension of select dimension
-        af::dim4 dims = in.dims();
-        af::dim4 strides = in.strides();
-        const Ti *inPtr = in.get();
+    // Cache the input's dims, strides and data pointer for the loops below
+    af::dim4 dims = in.dims();
+    af::dim4 strides = in.strides();
+    const Ti *inPtr = in.get();
 
-        for(dim_t l = 0; l < dims[3]; l++) {
-            dim_t off3 = l * strides[3];
+    for(dim_t l = 0; l < dims[3]; l++) {
+        dim_t off3 = l * strides[3];
 
-            for(dim_t k = 0; k < dims[2]; k++) {
-                dim_t off2 = k * strides[2];
+        for(dim_t k = 0; k < dims[2]; k++) {
+            dim_t off2 = k * strides[2];
 
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    dim_t off1 = j * strides[1];
+            for(dim_t j = 0; j < dims[1]; j++) {
+                dim_t off1 = j * strides[1];
 
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        dim_t idx = i + off1 + off2 + off3;
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    dim_t idx = i + off1 + off2 + off3;
 
-                        To in_val = transform(inPtr[idx]);
-                        if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
-                        out = reduce(in_val, out);
-                    }
+                    To in_val = transform(inPtr[idx]);
+                    if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+                    out = reduce(in_val, out);
                 }
             }
         }
-
-        return out;
     }
 
+    return out;
+}
+
 #define INSTANTIATE(ROp, Ti, To)                                        \
     template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim, \
                                            bool change_nan, double nanval); \
     template To reduce_all<ROp, Ti, To>(const Array<Ti> &in,            \
                                         bool change_nan, double nanval);
 
-    //min
-    INSTANTIATE(af_min_t, float  , float  )
-    INSTANTIATE(af_min_t, double , double )
-    INSTANTIATE(af_min_t, cfloat , cfloat )
-    INSTANTIATE(af_min_t, cdouble, cdouble)
-    INSTANTIATE(af_min_t, int    , int    )
-    INSTANTIATE(af_min_t, uint   , uint   )
-    INSTANTIATE(af_min_t, intl   , intl   )
-    INSTANTIATE(af_min_t, uintl  , uintl  )
-    INSTANTIATE(af_min_t, char   , char   )
-    INSTANTIATE(af_min_t, uchar  , uchar  )
-    INSTANTIATE(af_min_t, short  , short  )
-    INSTANTIATE(af_min_t, ushort , ushort )
-
-    //max
-    INSTANTIATE(af_max_t, float  , float  )
-    INSTANTIATE(af_max_t, double , double )
-    INSTANTIATE(af_max_t, cfloat , cfloat )
-    INSTANTIATE(af_max_t, cdouble, cdouble)
-    INSTANTIATE(af_max_t, int    , int    )
-    INSTANTIATE(af_max_t, uint   , uint   )
-    INSTANTIATE(af_max_t, intl   , intl   )
-    INSTANTIATE(af_max_t, uintl  , uintl  )
-    INSTANTIATE(af_max_t, char   , char   )
-    INSTANTIATE(af_max_t, uchar  , uchar  )
-    INSTANTIATE(af_max_t, short  , short  )
-    INSTANTIATE(af_max_t, ushort , ushort )
-
-    //sum
-    INSTANTIATE(af_add_t, float  , float  )
-    INSTANTIATE(af_add_t, double , double )
-    INSTANTIATE(af_add_t, cfloat , cfloat )
-    INSTANTIATE(af_add_t, cdouble, cdouble)
-    INSTANTIATE(af_add_t, int    , int    )
-    INSTANTIATE(af_add_t, int    , float  )
-    INSTANTIATE(af_add_t, uint   , uint   )
-    INSTANTIATE(af_add_t, uint   , float  )
-    INSTANTIATE(af_add_t, intl   , intl   )
-    INSTANTIATE(af_add_t, intl   , double )
-    INSTANTIATE(af_add_t, uintl  , uintl  )
-    INSTANTIATE(af_add_t, uintl  , double )
-    INSTANTIATE(af_add_t, char   , int    )
-    INSTANTIATE(af_add_t, char   , float  )
-    INSTANTIATE(af_add_t, uchar  , uint   )
-    INSTANTIATE(af_add_t, uchar  , float  )
-    INSTANTIATE(af_add_t, short  , int    )
-    INSTANTIATE(af_add_t, short  , float  )
-    INSTANTIATE(af_add_t, ushort , uint   )
-    INSTANTIATE(af_add_t, ushort , float  )
-
-    //mul
-    INSTANTIATE(af_mul_t, float  , float  )
-    INSTANTIATE(af_mul_t, double , double )
-    INSTANTIATE(af_mul_t, cfloat , cfloat )
-    INSTANTIATE(af_mul_t, cdouble, cdouble)
-    INSTANTIATE(af_mul_t, int    , int    )
-    INSTANTIATE(af_mul_t, uint   , uint   )
-    INSTANTIATE(af_mul_t, intl   , intl   )
-    INSTANTIATE(af_mul_t, uintl  , uintl  )
-    INSTANTIATE(af_mul_t, char   , int    )
-    INSTANTIATE(af_mul_t, uchar  , uint   )
-    INSTANTIATE(af_mul_t, short  , int    )
-    INSTANTIATE(af_mul_t, ushort , uint   )
-
-    // count
-    INSTANTIATE(af_notzero_t, float  , uint)
-    INSTANTIATE(af_notzero_t, double , uint)
-    INSTANTIATE(af_notzero_t, cfloat , uint)
-    INSTANTIATE(af_notzero_t, cdouble, uint)
-    INSTANTIATE(af_notzero_t, int    , uint)
-    INSTANTIATE(af_notzero_t, uint   , uint)
-    INSTANTIATE(af_notzero_t, intl   , uint)
-    INSTANTIATE(af_notzero_t, uintl  , uint)
-    INSTANTIATE(af_notzero_t, char   , uint)
-    INSTANTIATE(af_notzero_t, uchar  , uint)
-    INSTANTIATE(af_notzero_t, short  , uint)
-    INSTANTIATE(af_notzero_t, ushort , uint)
-
-    //anytrue
-    INSTANTIATE(af_or_t, float  , char)
-    INSTANTIATE(af_or_t, double , char)
-    INSTANTIATE(af_or_t, cfloat , char)
-    INSTANTIATE(af_or_t, cdouble, char)
-    INSTANTIATE(af_or_t, int    , char)
-    INSTANTIATE(af_or_t, uint   , char)
-    INSTANTIATE(af_or_t, intl   , char)
-    INSTANTIATE(af_or_t, uintl  , char)
-    INSTANTIATE(af_or_t, char   , char)
-    INSTANTIATE(af_or_t, uchar  , char)
-    INSTANTIATE(af_or_t, short  , char)
-    INSTANTIATE(af_or_t, ushort , char)
-
-    //alltrue
-    INSTANTIATE(af_and_t, float  , char)
-    INSTANTIATE(af_and_t, double , char)
-    INSTANTIATE(af_and_t, cfloat , char)
-    INSTANTIATE(af_and_t, cdouble, char)
-    INSTANTIATE(af_and_t, int    , char)
-    INSTANTIATE(af_and_t, uint   , char)
-    INSTANTIATE(af_and_t, intl   , char)
-    INSTANTIATE(af_and_t, uintl  , char)
-    INSTANTIATE(af_and_t, char   , char)
-    INSTANTIATE(af_and_t, uchar  , char)
-    INSTANTIATE(af_and_t, short  , char)
-    INSTANTIATE(af_and_t, ushort , char)
+//min
+INSTANTIATE(af_min_t, float  , float  )
+INSTANTIATE(af_min_t, double , double )
+INSTANTIATE(af_min_t, cfloat , cfloat )
+INSTANTIATE(af_min_t, cdouble, cdouble)
+INSTANTIATE(af_min_t, int    , int    )
+INSTANTIATE(af_min_t, uint   , uint   )
+INSTANTIATE(af_min_t, intl   , intl   )
+INSTANTIATE(af_min_t, uintl  , uintl  )
+INSTANTIATE(af_min_t, char   , char   )
+INSTANTIATE(af_min_t, uchar  , uchar  )
+INSTANTIATE(af_min_t, short  , short  )
+INSTANTIATE(af_min_t, ushort , ushort )
+
+//max
+INSTANTIATE(af_max_t, float  , float  )
+INSTANTIATE(af_max_t, double , double )
+INSTANTIATE(af_max_t, cfloat , cfloat )
+INSTANTIATE(af_max_t, cdouble, cdouble)
+INSTANTIATE(af_max_t, int    , int    )
+INSTANTIATE(af_max_t, uint   , uint   )
+INSTANTIATE(af_max_t, intl   , intl   )
+INSTANTIATE(af_max_t, uintl  , uintl  )
+INSTANTIATE(af_max_t, char   , char   )
+INSTANTIATE(af_max_t, uchar  , uchar  )
+INSTANTIATE(af_max_t, short  , short  )
+INSTANTIATE(af_max_t, ushort , ushort )
+
+//sum
+INSTANTIATE(af_add_t, float  , float  )
+INSTANTIATE(af_add_t, double , double )
+INSTANTIATE(af_add_t, cfloat , cfloat )
+INSTANTIATE(af_add_t, cdouble, cdouble)
+INSTANTIATE(af_add_t, int    , int    )
+INSTANTIATE(af_add_t, int    , float  )
+INSTANTIATE(af_add_t, uint   , uint   )
+INSTANTIATE(af_add_t, uint   , float  )
+INSTANTIATE(af_add_t, intl   , intl   )
+INSTANTIATE(af_add_t, intl   , double )
+INSTANTIATE(af_add_t, uintl  , uintl  )
+INSTANTIATE(af_add_t, uintl  , double )
+INSTANTIATE(af_add_t, char   , int    )
+INSTANTIATE(af_add_t, char   , float  )
+INSTANTIATE(af_add_t, uchar  , uint   )
+INSTANTIATE(af_add_t, uchar  , float  )
+INSTANTIATE(af_add_t, short  , int    )
+INSTANTIATE(af_add_t, short  , float  )
+INSTANTIATE(af_add_t, ushort , uint   )
+INSTANTIATE(af_add_t, ushort , float  )
+
+//mul
+INSTANTIATE(af_mul_t, float  , float  )
+INSTANTIATE(af_mul_t, double , double )
+INSTANTIATE(af_mul_t, cfloat , cfloat )
+INSTANTIATE(af_mul_t, cdouble, cdouble)
+INSTANTIATE(af_mul_t, int    , int    )
+INSTANTIATE(af_mul_t, uint   , uint   )
+INSTANTIATE(af_mul_t, intl   , intl   )
+INSTANTIATE(af_mul_t, uintl  , uintl  )
+INSTANTIATE(af_mul_t, char   , int    )
+INSTANTIATE(af_mul_t, uchar  , uint   )
+INSTANTIATE(af_mul_t, short  , int    )
+INSTANTIATE(af_mul_t, ushort , uint   )
+
+// count
+INSTANTIATE(af_notzero_t, float  , uint)
+INSTANTIATE(af_notzero_t, double , uint)
+INSTANTIATE(af_notzero_t, cfloat , uint)
+INSTANTIATE(af_notzero_t, cdouble, uint)
+INSTANTIATE(af_notzero_t, int    , uint)
+INSTANTIATE(af_notzero_t, uint   , uint)
+INSTANTIATE(af_notzero_t, intl   , uint)
+INSTANTIATE(af_notzero_t, uintl  , uint)
+INSTANTIATE(af_notzero_t, char   , uint)
+INSTANTIATE(af_notzero_t, uchar  , uint)
+INSTANTIATE(af_notzero_t, short  , uint)
+INSTANTIATE(af_notzero_t, ushort , uint)
+
+//anytrue
+INSTANTIATE(af_or_t, float  , char)
+INSTANTIATE(af_or_t, double , char)
+INSTANTIATE(af_or_t, cfloat , char)
+INSTANTIATE(af_or_t, cdouble, char)
+INSTANTIATE(af_or_t, int    , char)
+INSTANTIATE(af_or_t, uint   , char)
+INSTANTIATE(af_or_t, intl   , char)
+INSTANTIATE(af_or_t, uintl  , char)
+INSTANTIATE(af_or_t, char   , char)
+INSTANTIATE(af_or_t, uchar  , char)
+INSTANTIATE(af_or_t, short  , char)
+INSTANTIATE(af_or_t, ushort , char)
+
+//alltrue
+INSTANTIATE(af_and_t, float  , char)
+INSTANTIATE(af_and_t, double , char)
+INSTANTIATE(af_and_t, cfloat , char)
+INSTANTIATE(af_and_t, cdouble, char)
+INSTANTIATE(af_and_t, int    , char)
+INSTANTIATE(af_and_t, uint   , char)
+INSTANTIATE(af_and_t, intl   , char)
+INSTANTIATE(af_and_t, uintl  , char)
+INSTANTIATE(af_and_t, char   , char)
+INSTANTIATE(af_and_t, uchar  , char)
+INSTANTIATE(af_and_t, short  , char)
+INSTANTIATE(af_and_t, ushort , char)
+
 }
diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp
index b753fb5547..2384dd3341 100644
--- a/src/backend/cpu/regions.cpp
+++ b/src/backend/cpu/regions.cpp
@@ -17,186 +17,24 @@
 #include <map>
 #include <set>
 #include <algorithm>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/regions.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename T>
-class LabelNode
-{
-private:
-    T label;
-    T minLabel;
-    unsigned rank;
-    LabelNode* parent;
-
-public:
-    LabelNode() : label(0), minLabel(0), rank(0), parent(this) { }
-    LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { }
-
-    T getLabel()
-    {
-        return label;
-    }
-
-    T getMinLabel()
-    {
-        return minLabel;
-    }
-
-    LabelNode* getParent()
-    {
-        return parent;
-    }
-
-    unsigned getRank()
-    {
-        return rank;
-    }
-
-    void setMinLabel(T l)
-    {
-        minLabel = l;
-    }
-
-    void setParent(LabelNode* p)
-    {
-        parent = p;
-    }
-
-    void setRank(unsigned r)
-    {
-        rank = r;
-    }
-};
-
-template<typename T>
-static LabelNode<T>* find(LabelNode<T>* x)
-{
-    if (x->getParent() != x)
-        x->setParent(find(x->getParent()));
-    return x->getParent();
-}
-
-template<typename T>
-static void setUnion(LabelNode<T>* x, LabelNode<T>* y)
-{
-    LabelNode<T>* xRoot = find(x);
-    LabelNode<T>* yRoot = find(y);
-    if (xRoot == yRoot)
-        return;
-
-    T xMinLabel = xRoot->getMinLabel();
-    T yMinLabel = yRoot->getMinLabel();
-    xRoot->setMinLabel(min(xMinLabel, yMinLabel));
-    yRoot->setMinLabel(min(xMinLabel, yMinLabel));
-
-    if (xRoot->getRank() < yRoot->getRank())
-        xRoot->setParent(yRoot);
-    else if (xRoot->getRank() > yRoot->getRank())
-        yRoot->setParent(xRoot);
-    else {
-        yRoot->setParent(xRoot);
-        xRoot->setRank(xRoot->getRank() + 1);
-    }
-}
-
 template<typename T>
 Array<T> regions(const Array<char> &in, af_connectivity connectivity)
 {
-    const dim4 in_dims = in.dims();
-
-    // Create output placeholder
-    Array<T> out = createValueArray(in_dims, (T)0);
-
-    const char *in_ptr  = in.get();
-          T    *out_ptr = out.get();
-
-    // Map labels
-    typedef typename std::map<T, LabelNode<T>* > label_map_t;
-    typedef typename label_map_t::iterator label_map_iterator_t;
-
-    label_map_t lmap;
-
-    // Initial label
-    T label = (T)1;
-
-    for (int j = 0; j < (int)in_dims[1]; j++) {
-        for (int i = 0; i < (int)in_dims[0]; i++) {
-            int idx = j * in_dims[0] + i;
-            if (in_ptr[idx] != 0) {
-                std::vector<T> l;
-
-                // Test neighbors
-                if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)
-                    l.push_back(out_ptr[j * in_dims[0] + i-1]);
-                if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)
-                    l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
-                if (connectivity == AF_CONNECTIVITY_8 && i > 0 && j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
-                    l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
-                if (connectivity == AF_CONNECTIVITY_8 && i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)
-                    l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
-
-                if (!l.empty()) {
-                    T minl = l[0];
-                    for (size_t k = 0; k < l.size(); k++) {
-                        minl = min(l[k], minl);
-                        label_map_iterator_t cur_map = lmap.find(l[k]);
-                        LabelNode<T> *node = cur_map->second;
-                        // Group labels of the same region under a disjoint set
-                        for (size_t m = k+1; m < l.size(); m++)
-                            setUnion(node, lmap.find(l[m])->second);
-                    }
-                    // Set label to smallest neighbor label
-                    out_ptr[idx] = minl;
-                }
-                else {
-                    // Insert new label in map
-                    LabelNode<T> *node = new LabelNode<T>(label);
-                    lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
-                    out_ptr[idx] = label++;
-                }
-            }
-        }
-    }
-
-    std::set<T> removed;
-
-    for (int j = 0; j < (int)in_dims[1]; j++) {
-        for (int i = 0; i < (int)in_dims[0]; i++) {
-            int idx = j * (int)in_dims[0] + i;
-            if (in_ptr[idx] != 0) {
-                T l = out_ptr[idx];
-                label_map_iterator_t cur_map = lmap.find(l);
-
-                if (cur_map != lmap.end()) {
-                    LabelNode<T>* node = cur_map->second;
-
-                    LabelNode<T>* node_root = find(node);
-                    out_ptr[idx] = node_root->getMinLabel();
+    in.eval();
 
-                    // Mark removed labels (those that are part of a region
-                    // that contains a smaller label)
-                    if (node->getMinLabel() < l || node_root->getMinLabel() < l)
-                        removed.insert(l);
-                    if (node->getLabel() > node->getMinLabel())
-                        removed.insert(node->getLabel());
-                }
-            }
-        }
-    }
+    Array<T> out = createValueArray(in.dims(), (T)0);
+    out.eval();
 
-    // Calculate final neighbors (ensure final labels are sequential)
-    for (int j = 0; j < (int)in_dims[1]; j++) {
-        for (int i = 0; i < (int)in_dims[0]; i++) {
-            int idx = j * (int)in_dims[0] + i;
-            if (out_ptr[idx] > 0) {
-                out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));
-            }
-        }
-    }
+    getQueue().enqueue(kernel::regions<T>, out, in, connectivity);
 
     return out;
 }
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index a9824a4444..bd156585ee 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -9,69 +9,42 @@
 
 #include <Array.hpp>
 #include <reorder.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/reorder.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
-    {
-        const af::dim4 iDims = in.dims();
-        af::dim4 oDims(0);
-        for(int i = 0; i < 4; i++)
-            oDims[i] = iDims[rdims[i]];
 
-        Array<T> out = createEmptyArray<T>(oDims);
-
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
-
-        const af::dim4 ist = in.strides();
-        const af::dim4 ost = out.strides();
-
-
-        dim_t ids[4]  = {0};
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const dim_t oW = ow * ost[3];
-            ids[rdims[3]] = ow;
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const dim_t oZW = oW + oz * ost[2];
-                ids[rdims[2]] = oz;
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const dim_t oYZW = oZW + oy * ost[1];
-                    ids[rdims[1]] = oy;
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const dim_t oIdx = oYZW + ox;
-
-                        ids[rdims[0]] = ox;
-                        const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
-                                              ids[1] * ist[1] + ids[0];
+template<typename T>
+Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
+{
+    in.eval();
 
-                        outPtr[oIdx] = inPtr[iIdx];
-                    }
-                }
-            }
-        }
+    const af::dim4 iDims = in.dims();
+    af::dim4 oDims(0);
+    for(int i = 0; i < 4; i++)
+        oDims[i] = iDims[rdims[i]];
 
-        return out;
-    }
+    Array<T> out = createEmptyArray<T>(oDims);
+    getQueue().enqueue(kernel::reorder<T>, out, in, oDims, rdims);
+    return out;
+}
 
 #define INSTANTIATE(T)                                                         \
     template Array<T> reorder<T>(const Array<T> &in, const af::dim4 &rdims);  \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index 8c4da58934..eaeb5d4e3d 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -9,214 +9,54 @@
 
 #include <Array.hpp>
 #include <resize.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <math.hpp>
 #include <types.hpp>
 #include <af/traits.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/resize.hpp>
 
 namespace cpu
 {
-    /**
-     * noop function for round to avoid compilation
-     * issues due to lack of this function in C90 based
-     * compilers, it is only present in C99 and C++11
-     *
-     * This is not a full fledged implementation, this function
-     * is to be used only for positive numbers, i m using it here
-     * for calculating dimensions of arrays
-     */
-    dim_t round2int(float value)
-    {
-        return (dim_t)(value+0.5f);
-    }
-
-    using std::conditional;
-    using std::is_same;
-
-    template<typename T>
-    using wtype_t = typename conditional<is_same<T, double>::value, double, float>::type;
-
-    template<typename T>
-    using vtype_t = typename conditional<is_complex<T>::value,
-                                         T, wtype_t<T>
-                                        >::type;
-
-    template<typename T, af_interp_type method>
-    struct resize_op
-    {
-        void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const dim_t x, const dim_t y)
-        {
-            return;
-        }
-    };
-
-    template<typename T>
-    struct resize_op<T, AF_INTERP_NEAREST>
-    {
-        void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                const af::dim4 &ostrides, const af::dim4 &istrides,
-                const dim_t x, const dim_t y)
-        {
-            // Compute Indices
-            dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0]));
-            dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1]));
-
-            if (i_x >= idims[0]) i_x = idims[0] - 1;
-            if (i_y >= idims[1]) i_y = idims[1] - 1;
-
-            dim_t i_off = i_y * istrides[1] + i_x;
-            dim_t o_off =   y * ostrides[1] + x;
-            // Copy values from all channels
-            for(dim_t w = 0; w < odims[3]; w++) {
-                dim_t wost = w * ostrides[3];
-                dim_t wist = w * istrides[3];
-                for(dim_t z = 0; z < odims[2]; z++) {
-                    outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
-                }
-            }
-        }
-    };
-
-    template<typename T>
-    struct resize_op<T, AF_INTERP_BILINEAR>
-    {
-        void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                const af::dim4 &ostrides, const af::dim4 &istrides,
-                const dim_t x, const dim_t y)
-        {
-            // Compute Indices
-            float f_x = (float)x / (odims[0] / (float)idims[0]);
-            float f_y = (float)y / (odims[1] / (float)idims[1]);
-
-            dim_t i1_x  = floor(f_x);
-            dim_t i1_y  = floor(f_y);
-
-            if (i1_x >= idims[0]) i1_x = idims[0] - 1;
-            if (i1_y >= idims[1]) i1_y = idims[1] - 1;
-
-            float b   = f_x - i1_x;
-            float a   = f_y - i1_y;
-
-            dim_t i2_x  = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1);
-            dim_t i2_y  = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1);
-
-            typedef typename dtype_traits<T>::base_type BT;
-            typedef wtype_t<BT> WT;
-            typedef vtype_t<T> VT;
 
-            dim_t o_off = y * ostrides[1] + x;
-            // Copy values from all channels
-            for(dim_t w = 0; w < odims[3]; w++) {
-                dim_t wst = w * istrides[3];
-                for(dim_t z = 0; z < odims[2]; z++) {
-                    dim_t zst = z * istrides[2];
-                    dim_t channel_off = zst + wst;
-                    VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off];
-                    VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off];
-                    VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off];
-                    VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off];
-
-                    outPtr[o_off + z * ostrides[2] + w * ostrides[3]] =
-                                    scalar<WT>((1.0f - a) * (1.0f - b)) * p1 +
-                                    scalar<WT>((    a   ) * (1.0f - b)) * p2 +
-                                    scalar<WT>((1.0f - a) * (    b   )) * p3 +
-                                    scalar<WT>((    a   ) * (    b   )) * p4;
-                }
-            }
-        }
-    };
-
-    template<typename T>
-    struct resize_op<T, AF_INTERP_LOWER>
-    {
-        void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                const af::dim4 &ostrides, const af::dim4 &istrides,
-                const dim_t x, const dim_t y)
-        {
-            // Compute Indices
-            dim_t i_x = floor((float)x / (odims[0] / (float)idims[0]));
-            dim_t i_y = floor((float)y / (odims[1] / (float)idims[1]));
-
-            if (i_x >= idims[0]) i_x = idims[0] - 1;
-            if (i_y >= idims[1]) i_y = idims[1] - 1;
-
-            dim_t i_off = i_y * istrides[1] + i_x;
-            dim_t o_off =   y * ostrides[1] + x;
-            // Copy values from all channels
-            for(dim_t w = 0; w < odims[3]; w++) {
-                dim_t wost = w * ostrides[3];
-                dim_t wist = w * istrides[3];
-                for(dim_t z = 0; z < odims[2]; z++) {
-                    outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
-                }
-            }
-        }
-    };
-
-    template<typename T, af_interp_type method>
-    void resize_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                 const af::dim4 &ostrides, const af::dim4 &istrides)
-    {
-        resize_op<T, method> op;
-        for(dim_t y = 0; y < odims[1]; y++) {
-            for(dim_t x = 0; x < odims[0]; x++) {
-                op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y);
-            }
-        }
-    }
-
-    template<typename T>
-    Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
-                    const af_interp_type method)
-    {
-        af::dim4 idims = in.dims();
-        af::dim4 odims(odim0, odim1, idims[2], idims[3]);
-
-        // Create output placeholder
-        Array<T> outArray = createValueArray(odims, (T)0);
-
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-              T *outPtr = outArray.get();
-
-        af::dim4 ostrides = outArray.strides();
-        af::dim4 istrides = in.strides();
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                resize_<T, AF_INTERP_NEAREST>(outPtr, inPtr, odims, idims, ostrides, istrides);
-                break;
-            case AF_INTERP_BILINEAR:
-                resize_<T, AF_INTERP_BILINEAR>(outPtr, inPtr, odims, idims, ostrides, istrides);
-                break;
-            case AF_INTERP_LOWER:
-                resize_<T, AF_INTERP_LOWER>(outPtr, inPtr, odims, idims, ostrides, istrides);
-                break;
-            default:
-                break;
-        }
-        return outArray;
+template<typename T>
+Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
+                const af_interp_type method)
+{
+    af::dim4 idims = in.dims();
+    af::dim4 odims(odim0, odim1, idims[2], idims[3]);
+    // Create output placeholder
+    Array<T> out = createValueArray(odims, (T)0);
+    out.eval();
+    in.eval();
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_NEAREST>, out, in); break;
+        case AF_INTERP_BILINEAR:
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_BILINEAR>, out, in); break;
+        case AF_INTERP_LOWER:
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_LOWER>, out, in); break;
+        default: break;
     }
+    return out;
+}
 
-
-#define INSTANTIATE(T)                                                                            \
+#define INSTANTIATE(T)                                                                     \
     template Array<T> resize<T> (const Array<T> &in, const dim_t odim0, const dim_t odim1, \
                                  const af_interp_type method);
 
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index a4af64b669..0fb9b17674 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -9,112 +9,56 @@
 
 #include <Array.hpp>
 #include <rotate.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 #include "transform_interp.hpp"
+#include <kernel/rotate.hpp>
 
 namespace cpu
 {
-    template<typename T, af_interp_type method>
-    void rotate_(T *out, const T *in, const float theta,
-                 const af::dim4 &odims, const af::dim4 &idims,
-                 const af::dim4 &ostrides, const af::dim4 &istrides)
-    {
-        dim_t nimages = idims[2];
 
-        void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
-                     const af::dim4 &, const af::dim4 &,
-                     const dim_t, const dim_t, const dim_t, const dim_t);
-
-        const float c = cos(-theta), s = sin(-theta);
-        float tx, ty;
-        {
-            const float nx = 0.5 * (idims[0] - 1);
-            const float ny = 0.5 * (idims[1] - 1);
-            const float mx = 0.5 * (odims[0] - 1);
-            const float my = 0.5 * (odims[1] - 1);
-            const float sx = (mx * c + my *-s);
-            const float sy = (mx * s + my * c);
-            tx = -(sx - nx);
-            ty = -(sy - ny);
-        }
-
-        const float tmat[6] = {std::round( c * 1000) / 1000.0f,
-                               std::round(-s * 1000) / 1000.0f,
-                               std::round(tx * 1000) / 1000.0f,
-                               std::round( s * 1000) / 1000.0f,
-                               std::round( c * 1000) / 1000.0f,
-                               std::round(ty * 1000) / 1000.0f,
-                              };
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                t_fn = &transform_n;
-                break;
-            case AF_INTERP_BILINEAR:
-                t_fn = &transform_b;
-                break;
-            case AF_INTERP_LOWER:
-                t_fn = &transform_l;
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
+template<typename T>
+Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
+                 const af_interp_type method)
+{
+    in.eval();
 
+    Array<T> out = createEmptyArray<T>(odims);
 
-        // Do transform for image
-        for(int yy = 0; yy < (int)odims[1]; yy++) {
-            for(int xx = 0; xx < (int)odims[0]; xx++) {
-                t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
-            }
-        }
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_NEAREST>, out, in, theta);
+            break;
+        case AF_INTERP_BILINEAR:
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_BILINEAR>, out, in, theta);
+            break;
+        case AF_INTERP_LOWER:
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_LOWER>, out, in, theta);
+            break;
+        default:
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
     }
 
-    template<typename T>
-    Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
-                     const af_interp_type method)
-    {
-        Array<T> out = createEmptyArray<T>(odims);
-        const af::dim4 idims = in.dims();
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                rotate_<T, AF_INTERP_NEAREST>
-                       (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides());
-                break;
-            case AF_INTERP_BILINEAR:
-                rotate_<T, AF_INTERP_BILINEAR>
-                       (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides());
-                break;
-            case AF_INTERP_LOWER:
-                rotate_<T, AF_INTERP_LOWER>
-                       (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides());
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
-
-        return out;
-    }
+    return out;
+}
 
 
 #define INSTANTIATE(T)                                                              \
     template Array<T> rotate(const Array<T> &in, const float theta,                 \
                              const af::dim4 &odims, const af_interp_type method);
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp
index 2bdda210a2..78de4142c8 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/scan.cpp
@@ -14,102 +14,60 @@
 #include <Array.hpp>
 #include <scan.hpp>
 #include <ops.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/scan.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<af_op_t op, typename Ti, typename To, int D>
-    struct scan_dim
-    {
-        void operator()(To *out, const dim4 ostrides, const dim4 odims,
-                        const Ti *in , const dim4 istrides, const dim4 idims,
-                        const int dim)
-        {
-            const int D1 = D - 1;
-            for (dim_t i = 0; i < odims[D1]; i++) {
-                scan_dim<op, Ti, To, D1>()(out + i * ostrides[D1],
-                                           ostrides, odims,
-                                           in  + i * istrides[D1],
-                                           istrides, idims,
-                                           dim);
-                if (D1 == dim) break;
-            }
-        }
-    };
 
-    template<af_op_t op, typename Ti, typename To>
-    struct scan_dim<op, Ti, To, 0>
-    {
-        void operator()(To *out, const dim4 ostrides, const dim4 odims,
-                        const Ti *in , const dim4 istrides, const dim4 idims,
-                        const int dim)
-        {
-
-            dim_t istride = istrides[dim];
-            dim_t ostride = ostrides[dim];
-
-            Transform<Ti, To, op> transform;
-            // FIXME: Change the name to something better
-            Binary<To, op> scan;
-
-            To out_val = scan.init();
-            for (dim_t i = 0; i < idims[dim]; i++) {
-                To in_val = transform(in[i * istride]);
-                out_val = scan(in_val, out_val);
-                out[i * ostride] = out_val;
-            }
-        }
-    };
-
-    template<af_op_t op, typename Ti, typename To>
-    Array<To> scan(const Array<Ti>& in, const int dim)
-    {
-        dim4 dims = in.dims();
-
-        Array<To> out = createValueArray<To>(dims, 0);
+template<af_op_t op, typename Ti, typename To>
+Array<To> scan(const Array<Ti>& in, const int dim)
+{
+    dim4 dims     = in.dims();
+    Array<To> out = createEmptyArray<To>(dims);
+    in.eval();
 
-        switch (in.ndims()) {
+    switch (in.ndims()) {
         case 1:
-            scan_dim<op, Ti, To, 1>()(out.get(), out.strides(), out.dims(),
-                                      in.get(), in.strides(), in.dims(), dim);
+            kernel::scan_dim<op, Ti, To, 1> func1;
+            getQueue().enqueue(func1, out, 0, in, 0, dim);
             break;
-
         case 2:
-            scan_dim<op, Ti, To, 2>()(out.get(), out.strides(), out.dims(),
-                                      in.get(), in.strides(), in.dims(), dim);
+            kernel::scan_dim<op, Ti, To, 2> func2;
+            getQueue().enqueue(func2, out, 0, in, 0, dim);
             break;
-
         case 3:
-            scan_dim<op, Ti, To, 3>()(out.get(), out.strides(), out.dims(),
-                                      in.get(), in.strides(), in.dims(), dim);
+            kernel::scan_dim<op, Ti, To, 3> func3;
+            getQueue().enqueue(func3, out, 0, in, 0, dim);
             break;
-
         case 4:
-            scan_dim<op, Ti, To, 4>()(out.get(), out.strides(), out.dims(),
-                                      in.get(), in.strides(), in.dims(), dim);
+            kernel::scan_dim<op, Ti, To, 4> func4;
+            getQueue().enqueue(func4, out, 0, in, 0, dim);
             break;
-        }
-
-        return out;
     }
 
+    return out;
+}
+
 #define INSTANTIATE(ROp, Ti, To)                                        \
     template Array<To> scan<ROp, Ti, To>(const Array<Ti> &in, const int dim); \
 
-    //accum
-    INSTANTIATE(af_add_t, float  , float  )
-    INSTANTIATE(af_add_t, double , double )
-    INSTANTIATE(af_add_t, cfloat , cfloat )
-    INSTANTIATE(af_add_t, cdouble, cdouble)
-    INSTANTIATE(af_add_t, int    , int    )
-    INSTANTIATE(af_add_t, uint   , uint   )
-    INSTANTIATE(af_add_t, intl   , intl   )
-    INSTANTIATE(af_add_t, uintl  , uintl  )
-    INSTANTIATE(af_add_t, char   , int    )
-    INSTANTIATE(af_add_t, uchar  , uint   )
-    INSTANTIATE(af_add_t, short  , int    )
-    INSTANTIATE(af_add_t, ushort , uint   )
-    INSTANTIATE(af_notzero_t, char  , uint   )
+//accum
+INSTANTIATE(af_add_t, float  , float  )
+INSTANTIATE(af_add_t, double , double )
+INSTANTIATE(af_add_t, cfloat , cfloat )
+INSTANTIATE(af_add_t, cdouble, cdouble)
+INSTANTIATE(af_add_t, int    , int    )
+INSTANTIATE(af_add_t, uint   , uint   )
+INSTANTIATE(af_add_t, intl   , intl   )
+INSTANTIATE(af_add_t, uintl  , uintl  )
+INSTANTIATE(af_add_t, char   , int    )
+INSTANTIATE(af_add_t, uchar  , uint   )
+INSTANTIATE(af_add_t, short  , int    )
+INSTANTIATE(af_add_t, ushort , uint   )
+INSTANTIATE(af_notzero_t, char  , uint)
 
 }
diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp
index 7b2cc81735..1545a81f46 100644
--- a/src/backend/cpu/select.cpp
+++ b/src/backend/cpu/select.cpp
@@ -6,117 +6,37 @@
  * The complete license agreement can be obtained at:
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
+
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <select.hpp>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/select.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    void select(Array<T> &out, const Array<char> &cond, const Array<T> &a, const Array<T> &b)
-    {
-        dim4 adims = a.dims();
-        dim4 astrides = a.strides();
-        dim4 bdims = b.dims();
-        dim4 bstrides = b.strides();
-
-        dim4 cdims = cond.dims();
-        dim4 cstrides = cond.strides();
-
-        dim4 odims = out.dims();
-        dim4 ostrides = out.strides();
-
-        bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1],
-                            adims[2] == odims[2], adims[3] == odims[3]};
-
-        bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1],
-                            bdims[2] == odims[2], bdims[3] == odims[3]};
-
-        bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1],
-                            cdims[2] == odims[2], cdims[3] == odims[3]};
-
-        const T *aptr = a.get();
-        const T *bptr = b.get();
-        T *optr = out.get();
-        const char *cptr = cond.get();
-
-        for (int l = 0; l < odims[3]; l++) {
-
-            int o_off3   = ostrides[3] * l;
-            int a_off3   = astrides[3] * is_a_same[3] * l;
-            int b_off3   = bstrides[3] * is_b_same[3] * l;
-            int c_off3   = cstrides[3] * is_c_same[3] * l;
-
-            for (int k = 0; k < odims[2]; k++) {
-
-                int o_off2   = ostrides[2] * k + o_off3;
-                int a_off2   = astrides[2] * is_a_same[2] * k + a_off3;
-                int b_off2   = bstrides[2] * is_b_same[2] * k + b_off3;
-                int c_off2   = cstrides[2] * is_c_same[2] * k + c_off3;
-
-                for (int j = 0; j < odims[1]; j++) {
-
-                    int o_off1   = ostrides[1] * j + o_off2;
-                    int a_off1   = astrides[1] * is_a_same[1] * j + a_off2;
-                    int b_off1   = bstrides[1] * is_b_same[1] * j + b_off2;
-                    int c_off1   = cstrides[1] * is_c_same[1] * j + c_off2;
-
-                    for (int i = 0; i < odims[0]; i++) {
-
-                        bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1];
-                        T    aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1];
-                        T    bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1];
-                        T    oval = cval ? aval : bval;
-                        optr[o_off1 + i] = oval;
-                    }
-                }
-            }
-        }
-    }
 
-    template<typename T, bool flip>
-    void select_scalar(Array<T> &out, const Array<char> &cond, const Array<T> &a, const double &b)
-    {
-        dim4 astrides = a.strides();
-        dim4 cstrides = cond.strides();
-
-        dim4 odims = out.dims();
-        dim4 ostrides = out.strides();
-
-        const T *aptr = a.get();
-        T *optr = out.get();
-        const char *cptr = cond.get();
-
-        for (int l = 0; l < odims[3]; l++) {
-
-            int o_off3 = ostrides[3] * l;
-            int a_off3 = astrides[3] * l;
-            int c_off3 = cstrides[3] * l;
-
-            for (int k = 0; k < odims[2]; k++) {
-
-                int o_off2 = ostrides[2] * k + o_off3;
-                int a_off2 = astrides[2] * k + a_off3;
-                int c_off2 = cstrides[2] * k + c_off3;
-
-                for (int j = 0; j < odims[1]; j++) {
-
-                    int o_off1 = ostrides[1] * j + o_off2;
-                    int a_off1 = astrides[1] * j + a_off2;
-                    int c_off1 = cstrides[1] * j + c_off2;
-
-                    for (int i = 0; i < odims[0]; i++) {
-
-                        optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b;
-                    }
-                }
-            }
-        }
-    }
+template<typename T>
+void select(Array<T> &out, const Array<char> &cond, const Array<T> &a, const Array<T> &b)
+{
+    out.eval();
+    cond.eval();
+    a.eval();
+    b.eval();
+    getQueue().enqueue(kernel::select<T>, out, cond, a, b);
+}
 
+template<typename T, bool flip>
+void select_scalar(Array<T> &out, const Array<char> &cond, const Array<T> &a, const double &b)
+{
+    out.eval();
+    cond.eval();
+    a.eval();
+    getQueue().enqueue(kernel::select_scalar<T, flip>, out, cond, a, b);
+}
 
 #define INSTANTIATE(T)                                              \
     template void select<T>(Array<T> &out, const Array<char> &cond, \
@@ -130,16 +50,17 @@ namespace cpu
                                           const Array<T> &a,        \
                                           const double &b);         \
 
-    INSTANTIATE(float  )
-    INSTANTIATE(double )
-    INSTANTIATE(cfloat )
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int    )
-    INSTANTIATE(uint   )
-    INSTANTIATE(intl   )
-    INSTANTIATE(uintl  )
-    INSTANTIATE(char   )
-    INSTANTIATE(uchar  )
-    INSTANTIATE(short  )
-    INSTANTIATE(ushort )
+INSTANTIATE(float  )
+INSTANTIATE(double )
+INSTANTIATE(cfloat )
+INSTANTIATE(cdouble)
+INSTANTIATE(int    )
+INSTANTIATE(uint   )
+INSTANTIATE(intl   )
+INSTANTIATE(uintl  )
+INSTANTIATE(char   )
+INSTANTIATE(uchar  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
+
 }
diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp
index 3215e6d5c2..d6c2a611e0 100644
--- a/src/backend/cpu/set.cpp
+++ b/src/backend/cpu/set.cpp
@@ -18,105 +18,123 @@
 #include <sort.hpp>
 #include <err_cpu.hpp>
 #include <vector>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
-    using namespace std;
-    using af::dim4;
-
-    template<typename T>
-    Array<T> setUnique(const Array<T> &in,
-                        const bool is_sorted)
-    {
-        Array<T> out = createEmptyArray<T>(af::dim4());
-        if (is_sorted) out = copyArray<T>(in);
-        else           out = sort<T, 1>(in, 0);
-
-        T *ptr = out.get();
-        T *last = std::unique(ptr, ptr + in.elements());
-        dim_t dist = (dim_t)std::distance(ptr, last);
-
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
-        return out;
-    }
 
-    template<typename T>
-    Array<T> setUnion(const Array<T> &first,
-                       const Array<T> &second,
-                       const bool is_unique)
-    {
-        Array<T> uFirst = first;
-        Array<T> uSecond = second;
+using namespace std;
+using af::dim4;
+
+template<typename T>
+Array<T> setUnique(const Array<T> &in,
+                    const bool is_sorted)
+{
+    in.eval();
+
+    Array<T> out = createEmptyArray<T>(af::dim4());
+    if (is_sorted) out = copyArray<T>(in);
+    else           out = sort<T, 1>(in, 0);
 
-        if (!is_unique) {
-            // FIXME: Perhaps copy + unique would do ?
-            uFirst  = setUnique(first, false);
-            uSecond = setUnique(second, false);
-        }
+    // Need to sync old jobs since we need to
+    // operate on pointers directly in std::unique
+    getQueue().sync();
 
-        dim_t first_elements  = uFirst.elements();
-        dim_t second_elements = uSecond.elements();
-        dim_t elements = first_elements + second_elements;
+    T *ptr = out.get();
+    T *last = std::unique(ptr, ptr + in.elements());
+    dim_t dist = (dim_t)std::distance(ptr, last);
 
-        Array<T> out = createEmptyArray<T>(af::dim4(elements));
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
+    return out;
+}
 
-        T *ptr = out.get();
-        T *last = std::set_union(uFirst.get() , uFirst.get()  + first_elements,
-                                 uSecond.get(), uSecond.get() + second_elements,
-                                 ptr);
+template<typename T>
+Array<T> setUnion(const Array<T> &first,
+                   const Array<T> &second,
+                   const bool is_unique)
+{
+    first.eval();
+    second.eval();
+    getQueue().sync();
 
-        dim_t dist = (dim_t)std::distance(ptr, last);
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
+    Array<T> uFirst = first;
+    Array<T> uSecond = second;
 
-        return out;
+    if (!is_unique) {
+        // FIXME: Perhaps copy + unique would do ?
+        uFirst  = setUnique(first, false);
+        uSecond = setUnique(second, false);
     }
 
-    template<typename T>
-    Array<T> setIntersect(const Array<T> &first,
-                          const Array<T> &second,
-                          const bool is_unique)
-    {
-        Array<T> uFirst = first;
-        Array<T> uSecond = second;
+    dim_t first_elements  = uFirst.elements();
+    dim_t second_elements = uSecond.elements();
+    dim_t elements = first_elements + second_elements;
 
-        if (!is_unique) {
-            uFirst  = setUnique(first, false);
-            uSecond = setUnique(second, false);
-        }
+    Array<T> out = createEmptyArray<T>(af::dim4(elements));
 
-        dim_t first_elements  = uFirst.elements();
-        dim_t second_elements = uSecond.elements();
-        dim_t elements = std::max(first_elements, second_elements);
+    T *ptr = out.get();
+    T *last = std::set_union(uFirst.get() , uFirst.get()  + first_elements,
+                             uSecond.get(), uSecond.get() + second_elements,
+                             ptr);
 
-        Array<T> out = createEmptyArray<T>(af::dim4(elements));
+    dim_t dist = (dim_t)std::distance(ptr, last);
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
+
+    return out;
+}
 
-        T *ptr = out.get();
-        T *last = std::set_intersection(uFirst.get() , uFirst.get()  + first_elements,
-                                        uSecond.get(), uSecond.get() + second_elements,
-                                        ptr);
+template<typename T>
+Array<T> setIntersect(const Array<T> &first,
+                      const Array<T> &second,
+                      const bool is_unique)
+{
+    first.eval();
+    second.eval();
+    getQueue().sync();
 
-        dim_t dist = (dim_t)std::distance(ptr, last);
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
+    Array<T> uFirst = first;
+    Array<T> uSecond = second;
 
-        return out;
+    if (!is_unique) {
+        uFirst  = setUnique(first, false);
+        uSecond = setUnique(second, false);
     }
 
+    dim_t first_elements  = uFirst.elements();
+    dim_t second_elements = uSecond.elements();
+    dim_t elements = std::max(first_elements, second_elements);
+
+    Array<T> out = createEmptyArray<T>(af::dim4(elements));
+
+    T *ptr = out.get();
+    T *last = std::set_intersection(uFirst.get() , uFirst.get()  + first_elements,
+                                    uSecond.get(), uSecond.get() + second_elements,
+                                    ptr);
+
+    dim_t dist = (dim_t)std::distance(ptr, last);
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
+
+    return out;
+}
+
 #define INSTANTIATE(T)                                                  \
     template Array<T> setUnique<T>(const Array<T> &in, const bool is_sorted); \
     template Array<T> setUnion<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
     template Array<T> setIntersect<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index 05cac4c678..041f1ab8ba 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -9,80 +9,40 @@
 
 #include <Array.hpp>
 #include <shift.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <cassert>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/shift.hpp>
 
 namespace cpu
 {
-    static inline dim_t simple_mod(const dim_t i, const dim_t dim)
-    {
-        return (i < dim) ? i : (i - dim);
-    }
 
-    template<typename T>
-    Array<T> shift(const Array<T> &in, const int sdims[4])
-    {
-        const af::dim4 iDims = in.dims();
-        af::dim4 oDims = iDims;
-
-        Array<T> out = createEmptyArray<T>(oDims);
-
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
-
-        const af::dim4 ist = in.strides();
-        const af::dim4 ost = out.strides();
-
-        int sdims_[4];
-        // Need to do this because we are mapping output to input in the kernel
-        for(int i = 0; i < 4; i++) {
-            // sdims_[i] will always be positive and always [0, oDims[i]].
-            // Negative shifts are converted to position by going the other way round
-            sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0);
-            assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]);
-        }
+template<typename T>  // CPU-backend shift: allocates the output and enqueues kernel::shift<T> on the backend queue
+Array<T> shift(const Array<T> &in, const int sdims[4])  // sdims: four per-dimension shift amounts; all four entries are read
+{
+    in.eval();  // NOTE(review): presumably materializes any pending computation on `in` before the kernel uses it -- confirm against Array::eval
 
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const int oW = ow * ost[3];
-            const int iw = simple_mod((ow + sdims_[3]), oDims[3]);
-            const int iW = iw * ist[3];
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const int oZW = oW + oz * ost[2];
-                const int iz = simple_mod((oz + sdims_[2]), oDims[2]);
-                const int iZW = iW + iz * ist[2];
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const int oYZW = oZW + oy * ost[1];
-                    const int iy = simple_mod((oy + sdims_[1]), oDims[1]);
-                    const int iYZW = iZW + iy * ist[1];
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const int oIdx = oYZW + ox;
-                        const int ix = simple_mod((ox + sdims_[0]), oDims[0]);
-                        const int iIdx = iYZW + ix;
+    Array<T> out = createEmptyArray<T>(in.dims());  // output is created with the same dimensions as the input
+    const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]);  // repack the raw int[4] shifts into a dim4 to pass to the kernel
 
-                        outPtr[oIdx] = inPtr[iIdx];
-                    }
-                }
-            }
-        }
+    getQueue().enqueue(kernel::shift<T>, out, in, temp);  // NOTE(review): presumably asynchronous, so `out` may not be filled when this returns -- confirm queue semantics
 
-        return out;
-    }
+    return out;
+}
 
 #define INSTANTIATE(T)                                                  \
     template Array<T> shift<T>(const Array<T> &in, const int sdims[4]); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)   // each line explicitly instantiates shift<T> for the listed element type
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp
index 70bb11d1ae..0345e37485 100644
--- a/src/backend/cpu/sift.cpp
+++ b/src/backend/cpu/sift.cpp
@@ -21,8 +21,8 @@
 #include <cfloat>
 #include <vector>
 
-#ifdef AF_BUILD_SIFT
-#include <sift_nonfree.hpp>
+#ifdef AF_BUILD_NONFREE_SIFT
+#include <kernel/sift_nonfree.hpp>
 #endif
 
 using af::dim4;
@@ -39,7 +39,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const float img_scale, const float feature_ratio,
               const bool compute_GLOH)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     return sift_impl<T, convAccT>(x, y, score, ori, size, desc, in, n_layers,
                                   contrast_thr, edge_thr, init_sigma, double_input,
                                   img_scale, feature_ratio, compute_GLOH);
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
deleted file mode 100644
index 514a134c7d..0000000000
--- a/src/backend/cpu/sift_nonfree.hpp
+++ /dev/null
@@ -1,1193 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob@iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe@cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements.  Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims.  If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement.  It does not protect you from
-// committing patent infringement.  So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either.  If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright and
-//       patent notices, this list of conditions and the following
-//       disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in
-//       the documentation and/or other materials provided with the
-//       distribution.
-//     * Neither the name of Oregon State University nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-using af::dim4;
-
-namespace cpu
-{
-
-    static const float PI_VAL = 3.14159265358979323846f;
-
-// default width of descriptor histogram array
-    static const int DescrWidth = 4;
-
-// default number of bins per histogram in descriptor array
-    static const int DescrHistBins = 8;
-
-// assumed gaussian blur for input image
-    static const float InitSigma = 0.5f;
-
-// width of border in which to ignore keypoints
-    static const int ImgBorder = 5;
-
-// maximum steps of keypoint interpolation before failure
-    static const int MaxInterpSteps = 5;
-
-// default number of bins in histogram for orientation assignment
-    static const int OriHistBins = 36;
-
-// determines gaussian sigma for orientation assignment
-    static const float OriSigFctr = 1.5f;
-
-// determines the radius of the region used in orientation assignment */
-    static const float OriRadius = 3.0f * OriSigFctr;
-
-// number of passes of orientation histogram smoothing
-    static const int SmoothOriPasses = 2;
-
-// orientation magnitude relative to max that results in new feature
-    static const float OriPeakRatio = 0.8f;
-
-// determines the size of a single descriptor orientation histogram
-    static const float DescrSclFctr = 3.f;
-
-// threshold on magnitude of elements of descriptor vector
-    static const float DescrMagThr = 0.2f;
-
-// factor used to convert floating-point descriptor to unsigned char
-    static const float IntDescrFctr = 512.f;
-
-// Number of GLOH bins in radial direction
-    static const unsigned GLOHRadialBins = 3;
-
-// Radiuses of GLOH descriptors
-    static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
-
-// Number of GLOH angular bins (excluding the inner-most radial section)
-    static const unsigned GLOHAngularBins = 8;
-
-// Number of GLOH bins per histogram in descriptor
-    static const unsigned GLOHHistBins = 16;
-
-    typedef struct
-    {
-        float    f[4];
-        unsigned l;
-    } feat_t;
-
-    bool feat_cmp(feat_t i, feat_t j)
-    {
-        for (int k = 0; k < 4; k++)
-            if (i.f[k] != j.f[k])
-                return (i.f[k] < j.f[k]);
-        if (i.l != j.l)
-            return (i.l < j.l);
-
-        return true;
-    }
-
-    void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
-    {
-        feat.resize(nfeat);
-        for (unsigned i = 0; i < feat.size(); i++) {
-            feat[i].f[0] = x[i];
-            feat[i].f[1] = y[i];
-            feat[i].f[2] = resp[i];
-            feat[i].f[3] = size[i];
-            feat[i].l    = layer[i];
-        }
-    }
-
-    template<typename T>
-    void gaussian1D(T* out, const int dim, double sigma=0.0)
-    {
-        if(!(sigma>0)) sigma = 0.25*dim;
-
-        T sum = (T)0;
-        for(int i=0;i<dim;i++)
-        {
-            int x = i-(dim-1)/2;
-            T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-            out[i] = el;
-            sum   += el;
-        }
-
-        for(int k=0;k<dim;k++)
-            out[k] /= sum;
-    }
-
-    template<typename T>
-    Array<T> gauss_filter(float sigma)
-    {
-        // Using 6-sigma rule
-        unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
-
-        Array<T> filter = createEmptyArray<T>(gauss_len);
-        gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
-
-        return filter;
-    }
-
-    template<int N>
-    void gaussianElimination(float* A, float* b, float* x)
-    {
-        // forward elimination
-        for (int i = 0; i < N-1; i++) {
-            for (int j = i+1; j < N; j++) {
-                float s = A[j*N+i] / A[i*N+i];
-
-                for (int k = i; k < N; k++)
-                    A[j*N+k] -= s * A[i*N+k];
-
-                b[j] -= s * b[i];
-            }
-        }
-
-        for (int i = 0; i < N; i++)
-            x[i] = 0;
-
-        // backward substitution
-        float sum = 0;
-        for (int i = 0; i <= N-2; i++) {
-            sum = b[i];
-            for (int j = i+1; j < N; j++)
-                sum -= A[i*N+j] * x[j];
-            x[i] = sum / A[i*N+i];
-        }
-    }
-
-    template<typename T>
-    void sub(
-        Array<T>& out,
-        const Array<T>& in1,
-        const Array<T>& in2)
-    {
-        size_t nel = in1.elements();
-        T* out_ptr = out.get();
-        const T* in1_ptr = in1.get();
-        const T* in2_ptr = in2.get();
-
-        for (size_t i = 0; i < nel; i++) {
-            out_ptr[i] = in1_ptr[i] - in2_ptr[i];
-        }
-    }
-
-#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
-#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
-#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-    template<typename T>
-    void detectExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        unsigned* counter,
-        const Array<T>& prev,
-        const Array<T>& center,
-        const Array<T>& next,
-        const unsigned layer,
-        const unsigned max_feat,
-        const float threshold)
-    {
-        const af::dim4 idims = center.dims();
-        const T* prev_ptr    = prev.get();
-        const T* center_ptr  = center.get();
-        const T* next_ptr    = next.get();
-
-        for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
-            for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
-                float p = center_ptr[y*idims[0] + x];
-
-                // Find extrema
-                if (abs((float)p) > threshold &&
-                    ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
-                      p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y,   x+1)  &&
-                      p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1)  &&
-                      p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1)  &&
-                      p > PPTR(y,   x-1) && p > PPTR(y  , x) && p > PPTR(y,   x+1)  &&
-                      p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1)  &&
-                      p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1)  &&
-                      p > NPTR(y,   x-1) && p > NPTR(y  , x) && p > NPTR(y,   x+1)  &&
-                      p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
-                     (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
-                      p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y,   x+1)  &&
-                      p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1)  &&
-                      p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1)  &&
-                      p < PPTR(y,   x-1) && p < PPTR(y  , x) && p < PPTR(y,   x+1)  &&
-                      p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1)  &&
-                      p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1)  &&
-                      p < NPTR(y,   x-1) && p < NPTR(y  , x) && p < NPTR(y,   x+1)  &&
-                      p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
-
-                    if (*counter < max_feat)
-                    {
-                        x_out[*counter] = (float)y;
-                        y_out[*counter] = (float)x;
-                        layer_out[*counter] = layer;
-                        (*counter)++;
-                    }
-                }
-            }
-        }
-    }
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-    template<typename T>
-    void interpolateExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const unsigned extrema_feat,
-        std::vector< Array<T> >& dog_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const float contrast_thr,
-        const float edge_thr,
-        const float sigma,
-        const float img_scale)
-    {
-        for (int f = 0; f < (int)extrema_feat; f++) {
-            const float first_deriv_scale = img_scale*0.5f;
-            const float second_deriv_scale = img_scale;
-            const float cross_deriv_scale = img_scale*0.25f;
-
-            float xl = 0, xy = 0, xx = 0, contr = 0;
-            int i = 0;
-
-            unsigned x = x_in[f];
-            unsigned y = y_in[f];
-            unsigned layer = layer_in[f];
-
-            const T* prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
-            const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
-            const T* next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
-
-            af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
-
-            bool converges = true;
-
-            for (i = 0; i < MaxInterpSteps; i++) {
-                float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                               (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                               (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-
-                float d2  = CPTR(x, y) * 2.f;
-                float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-                float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-                float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
-                float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                             CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-                float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
-                             PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
-                float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
-                             PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
-                float H[9] = {dxx, dxy, dxs,
-                              dxy, dyy, dys,
-                              dxs, dys, dss};
-
-                float X[3];
-                gaussianElimination<3>(H, dD, X);
-
-                xl = -X[2];
-                xy = -X[1];
-                xx = -X[0];
-
-                if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
-                    break;
-
-                x += round(xx);
-                y += round(xy);
-                layer += round(xl);
-
-                if (layer < 1 || layer > n_layers ||
-                    x < ImgBorder || x >= idims[1] - ImgBorder ||
-                    y < ImgBorder || y >= idims[0] - ImgBorder) {
-                    converges = false;
-                    break;
-                }
-            }
-
-            // ensure convergence of interpolation
-            if (i >= MaxInterpSteps || !converges)
-                continue;
-
-            float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                           (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                           (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-            float X[3] = {xx, xy, xl};
-
-            float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
-            contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
-            if(abs(contr) < (contrast_thr / n_layers))
-                continue;
-
-            // principal curvatures are computed using the trace and det of Hessian
-            float d2  = CPTR(x, y) * 2.f;
-            float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-            float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-            float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                         CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
-            float tr = dxx + dyy;
-            float det = dxx * dyy - dxy * dxy;
-
-            // add FLT_EPSILON for double-precision compatibility
-            if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
-                continue;
-
-            if (*counter < max_feat)
-            {
-                x_out[*counter] = (x + xx) * (1 << octave);
-                y_out[*counter] = (y + xy) * (1 << octave);
-                layer_out[*counter] = layer;
-                response_out[*counter] = abs(contr);
-                size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
-                (*counter)++;
-            }
-        }
-    }
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-    void removeDuplicates(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const std::vector<feat_t>& sorted_feat)
-    {
-        size_t nfeat = sorted_feat.size();
-
-        for (size_t f = 0; f < nfeat; f++) {
-            float prec_fctr = 1e4f;
-
-            if (f < nfeat-1) {
-                if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
-                    round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
-                    round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
-                    round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
-                    sorted_feat[f].l == sorted_feat[f+1].l)
-                    continue;
-            }
-
-            x_out[*counter] = sorted_feat[f].f[0];
-            y_out[*counter] = sorted_feat[f].f[1];
-            response_out[*counter] = sorted_feat[f].f[2];
-            size_out[*counter] = sorted_feat[f].f[3];
-            layer_out[*counter] = sorted_feat[f].l;
-            (*counter)++;
-        }
-    }
-
-#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
-
-// Computes a canonical orientation for each image feature in an array.  Based
-// on Section 5 of Lowe's paper.  This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-    template<typename T>
-    void calcOrientation(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        float* ori_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const bool double_input)
-    {
-        const int n = OriHistBins;
-
-        float hist[OriHistBins];
-        float temphist[OriHistBins];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            // Load keypoint information
-            const float real_x = x_in[f];
-            const float real_y = y_in[f];
-            const unsigned layer = layer_in[f];
-            const float response = response_in[f];
-            const float size = size_in[f];
-
-            const int pt_x = (int)round(real_x / (1 << octave));
-            const int pt_y = (int)round(real_y / (1 << octave));
-
-            // Calculate auxiliary parameters
-            const float scl_octv = size*0.5f / (1 << octave);
-            const int radius = (int)round(OriRadius * scl_octv);
-            const float sigma = OriSigFctr * scl_octv;
-            const int len = (radius*2+1);
-            const float exp_denom = 2.f * sigma * sigma;
-
-            // Points img to correct Gaussian pyramid layer
-            const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-
-            for (int i = 0; i < OriHistBins; i++)
-                hist[i] = 0.f;
-
-            af::dim4 idims = img.dims();
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = pt_y + i;
-                int x = pt_x + j;
-                if (y < 1 || y >= idims[0] - 1 ||
-                    x < 1 || x >= idims[1] - 1)
-                    continue;
-
-                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                float mag = sqrt(dx*dx+dy*dy);
-                float ori = atan2(dy,dx);
-                float w = exp(-(i*i + j*j)/exp_denom);
-
-                int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
-                bin = bin < n ? bin : 0;
-
-                hist[bin] += w*mag;
-            }
-
-            for (int i = 0; i < SmoothOriPasses; i++) {
-                for (int j = 0; j < n; j++) {
-                    temphist[j] = hist[j];
-                }
-                for (int j = 0; j < n; j++) {
-                    float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
-                    float next = (j+1 == n) ? temphist[0] : temphist[j+1];
-                    hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
-                }
-            }
-
-            float omax = hist[0];
-            for (int i = 1; i < n; i++)
-                omax = max(omax, hist[i]);
-
-            float mag_thr = (float)(omax * OriPeakRatio);
-            int l, r;
-            for (int j = 0; j < n; j++) {
-                l = (j == 0) ? n - 1 : j - 1;
-                r = (j + 1) % n;
-                if (hist[j] > hist[l] &&
-                    hist[j] > hist[r] &&
-                    hist[j] >= mag_thr) {
-                    if (*counter < max_feat) {
-                        float bin = j + 0.5f * (hist[l] - hist[r]) /
-                            (hist[l] - 2.0f*hist[j] + hist[r]);
-                        bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
-                        float ori = 360.f - ((360.f/n) * bin);
-
-                        float new_real_x = real_x;
-                        float new_real_y = real_y;
-                        float new_size = size;
-
-                        if (double_input) {
-                            float scale = 0.5f;
-                            new_real_x *= scale;
-                            new_real_y *= scale;
-                            new_size *= scale;
-                        }
-
-                        x_out[*counter] = new_real_x;
-                        y_out[*counter] = new_real_y;
-                        layer_out[*counter] = layer;
-                        response_out[*counter] = response;
-                        size_out[*counter] = new_size;
-                        ori_out[*counter] = ori;
-                        (*counter)++;
-                    }
-                }
-            }
-        }
-    }
-
-    void normalizeDesc(
-        float* desc,
-        const int histlen)
-    {
-        float len_sq = 0.0f;
-
-        for (int i = 0; i < histlen; i++)
-            len_sq += desc[i] * desc[i];
-
-        float len_inv = 1.0f / sqrt(len_sq);
-
-        for (int i = 0; i < histlen; i++) {
-            desc[i] *= len_inv;
-        }
-    }
-
-// Computes feature descriptors for features in an array.  Based on Section 6
-// of Lowe's paper.
-    template<typename T>
-    void computeDescriptor(
-        float* desc_out,
-        const unsigned desc_len,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const float* ori_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const int d,
-        const int n,
-        const float scale,
-        const unsigned octave,
-        const unsigned n_layers)
-    {
-        float desc[128];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            const unsigned layer = layer_in[f];
-            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-            const float size = size_in[f];
-            const int fx = round(x_in[f] * scale);
-            const int fy = round(y_in[f] * scale);
-
-            // Points img to correct Gaussian pyramid layer
-            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-            af::dim4 idims = img.dims();
-
-            float cos_t = cos(ori);
-            float sin_t = sin(ori);
-            float bins_per_rad = n / (PI_VAL * 2.f);
-            float exp_denom = d * d * 0.5f;
-            float hist_width = DescrSclFctr * size * scale * 0.5f;
-            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-            int len = radius*2+1;
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = 0.f;
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = fy + i;
-                int x = fx + j;
-
-                float x_rot = (j * cos_t - i * sin_t) / hist_width;
-                float y_rot = (j * sin_t + i * cos_t) / hist_width;
-                float xbin = x_rot + d/2 - 0.5f;
-                float ybin = y_rot + d/2 - 0.5f;
-
-                if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
-                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
-                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                    float grad_mag = sqrt(dx*dx + dy*dy);
-                    float grad_ori = atan2(dy, dx) - ori;
-                    while (grad_ori < 0.0f)
-                        grad_ori += PI_VAL*2;
-                    while (grad_ori >= PI_VAL*2)
-                        grad_ori -= PI_VAL*2;
-
-                    float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
-                    float obin = grad_ori * bins_per_rad;
-                    float mag = grad_mag*w;
-
-                    int x0 = floor(xbin);
-                    int y0 = floor(ybin);
-                    int o0 = floor(obin);
-                    xbin -= x0;
-                    ybin -= y0;
-                    obin -= o0;
-
-                    for (int yl = 0; yl <= 1; yl++) {
-                        int yb = y0 + yl;
-                        if (yb >= 0 && yb < d) {
-                            float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
-                            for (int xl = 0; xl <= 1; xl++) {
-                                int xb = x0 + xl;
-                                if (xb >= 0 && xb < d) {
-                                    float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
-                                    for (int ol = 0; ol <= 1; ol++) {
-                                        int ob = (o0 + ol) % n;
-                                        float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-                                        desc[(yb*d + xb)*n + ob] += v_o;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            normalizeDesc(desc, desc_len);
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = min(desc[i], DescrMagThr);
-
-            normalizeDesc(desc, desc_len);
-
-            // Calculate final descriptor values
-            for (int k = 0; k < (int)desc_len; k++) {
-                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
-            }
-        }
-    }
-
-// Computes GLOH feature descriptors for features in an array. Based on Section III-B
-// of Mikolajczyk and Schmid paper.
-    template<typename T>
-    void computeGLOHDescriptor(
-        float* desc_out,
-        const unsigned desc_len,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const float* ori_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const int d,
-        const unsigned rb,
-        const unsigned ab,
-        const unsigned hb,
-        const float scale,
-        const unsigned octave,
-        const unsigned n_layers)
-    {
-        float desc[272];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            const unsigned layer = layer_in[f];
-            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-            const float size = size_in[f];
-            const int fx = round(x_in[f] * scale);
-            const int fy = round(y_in[f] * scale);
-
-            // Points img to correct Gaussian pyramid layer
-            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-            af::dim4 idims = img.dims();
-
-            float cos_t = cos(ori);
-            float sin_t = sin(ori);
-            float hist_bins_per_rad = hb / (PI_VAL * 2.f);
-            float polar_bins_per_rad = ab / (PI_VAL * 2.f);
-            float exp_denom = GLOHRadii[rb-1] * 0.5f;
-
-            float hist_width = DescrSclFctr * size * scale * 0.5f;
-
-            // Keep same descriptor radius used for SIFT
-            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-            // Alternative radius size calculation, changing the radius weight
-            // (rw) in the range of 0.25f-0.75f gives different results,
-            // increasing it tends to show a better recall rate but with a
-            // smaller amount of correct matches
-            //float rw = 0.5f;
-            //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
-
-            int len = radius*2+1;
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = 0.f;
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = fy + i;
-                int x = fx + j;
-
-                float x_rot = (j * cos_t - i * sin_t);
-                float y_rot = (j * sin_t + i * cos_t);
-
-                float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
-                float theta = atan2(y_rot, x_rot);
-                while (theta < 0.0f)
-                    theta += PI_VAL*2;
-                while (theta >= PI_VAL*2)
-                    theta -= PI_VAL*2;
-
-                float tbin = theta * polar_bins_per_rad;
-                float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
-                             ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
-                             min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
-
-                if (r <= GLOHRadii[rb-1] &&
-                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
-                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                    float grad_mag = sqrt(dx*dx + dy*dy);
-                    float grad_ori = atan2(dy, dx) - ori;
-                    while (grad_ori < 0.0f)
-                        grad_ori += PI_VAL*2;
-                    while (grad_ori >= PI_VAL*2)
-                        grad_ori -= PI_VAL*2;
-
-                    float w = exp(-r / exp_denom);
-                    float obin = grad_ori * hist_bins_per_rad;
-                    float mag = grad_mag*w;
-
-                    int t0 = floor(tbin);
-                    int r0 = floor(rbin);
-                    int o0 = floor(obin);
-                    tbin -= t0;
-                    rbin -= r0;
-                    obin -= o0;
-
-                    for (int rl = 0; rl <= 1; rl++) {
-                        int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
-                        float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
-                        if (rb >= 0 && rb <= 2) {
-                            for (int tl = 0; tl <= 1; tl++) {
-                                int tb = (t0 + tl) % ab;
-                                float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
-                                for (int ol = 0; ol <= 1; ol++) {
-                                    int ob = (o0 + ol) % hb;
-                                    float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
-                                    unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
-                                    desc[idx] += v_o;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            normalizeDesc(desc, desc_len);
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = min(desc[i], DescrMagThr);
-
-            normalizeDesc(desc, desc_len);
-
-            // Calculate final descriptor values
-            for (int k = 0; k < (int)desc_len; k++) {
-                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
-            }
-        }
-    }
-
-#undef IPTR
-
-    template<typename T, typename convAccT>
-    Array<T> createInitialImage(
-        const Array<T>& img,
-        const float init_sigma,
-        const bool double_input)
-    {
-        af::dim4 idims = img.dims();
-
-        Array<T> init_img = createEmptyArray<T>(af::dim4());
-
-        float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f)
-                                 : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f);
-
-        Array<T> filter = gauss_filter<T>(s);
-
-        if (double_input) {
-            Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
-            init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
-        }
-        else {
-            init_img = convolve2<T, convAccT, false>(img, filter, filter);
-        }
-
-        return init_img;
-    }
-
-    template<typename T, typename convAccT>
-    std::vector< Array<T> > buildGaussPyr(
-        const Array<T>& init_img,
-        const unsigned n_octaves,
-        const unsigned n_layers,
-        const float init_sigma)
-    {
-        // Precompute Gaussian sigmas using the following formula:
-        // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
-        std::vector<float> sig_layers(n_layers + 3);
-        sig_layers[0] = init_sigma;
-        float k = std::pow(2.0f, 1.0f / n_layers);
-        for (unsigned i = 1; i < n_layers + 3; i++) {
-            float sig_prev = std::pow(k, i-1) * init_sigma;
-            float sig_total = sig_prev * k;
-            sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
-        }
-
-        // Gaussian Pyramid
-        std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+3; l++) {
-                unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
-                unsigned idx = o*(n_layers+3) + l;
-
-                if (o == 0 && l == 0) {
-                    gauss_pyr[idx] = init_img;
-                }
-                else if (l == 0) {
-                    af::dim4 sdims = gauss_pyr[src_idx].dims();
-                    gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
-                }
-                else {
-                    Array<T> filter = gauss_filter<T>(sig_layers[l]);
-
-                    gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
-                }
-            }
-        }
-
-        return gauss_pyr;
-    }
-
-    template<typename T>
-    std::vector< Array<T> > buildDoGPyr(
-        std::vector< Array<T> >& gauss_pyr,
-        const unsigned n_octaves,
-        const unsigned n_layers)
-    {
-        // DoG Pyramid
-        std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+2; l++) {
-                unsigned idx    = o*(n_layers+2) + l;
-                unsigned bottom = o*(n_layers+3) + l;
-                unsigned top    = o*(n_layers+3) + l+1;
-
-                dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
-
-                sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
-            }
-        }
-
-        return dog_pyr;
-    }
-
-
-    template<typename T, typename convAccT>
-    unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
-                       Array<float>& ori, Array<float>& size, Array<float>& desc,
-                       const Array<T>& in, const unsigned n_layers,
-                       const float contrast_thr, const float edge_thr,
-                       const float init_sigma, const bool double_input,
-                       const float img_scale, const float feature_ratio,
-                       const bool compute_GLOH)
-    {
-        af::dim4 idims = in.dims();
-
-        const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
-            : min(idims[0], idims[1]);
-        const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
-        Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
-
-        std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
-        std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
-
-        std::vector<float*> x_pyr(n_octaves, NULL);
-        std::vector<float*> y_pyr(n_octaves, NULL);
-        std::vector<float*> response_pyr(n_octaves, NULL);
-        std::vector<float*> size_pyr(n_octaves, NULL);
-        std::vector<float*> ori_pyr(n_octaves, NULL);
-        std::vector<float*> desc_pyr(n_octaves, NULL);
-        std::vector<unsigned> feat_pyr(n_octaves, 0);
-        unsigned total_feat = 0;
-
-        const unsigned d = DescrWidth;
-        const unsigned n = DescrHistBins;
-        const unsigned rb = GLOHRadialBins;
-        const unsigned ab = GLOHAngularBins;
-        const unsigned hb = GLOHHistBins;
-        const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
-
-        for (unsigned i = 0; i < n_octaves; i++) {
-            af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
-            if (ddims[0]-2*ImgBorder < 1 ||
-                ddims[1]-2*ImgBorder < 1)
-                continue;
-
-            const unsigned imel = ddims[0] * ddims[1];
-            const unsigned max_feat = ceil(imel * feature_ratio);
-
-            float* extrema_x = memAlloc<float>(max_feat);
-            float* extrema_y = memAlloc<float>(max_feat);
-            unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
-            unsigned extrema_feat = 0;
-
-            for (unsigned j = 1; j <= n_layers; j++) {
-                unsigned prev   = i*(n_layers+2) + j-1;
-                unsigned center = i*(n_layers+2) + j;
-                unsigned next   = i*(n_layers+2) + j+1;
-
-                unsigned layer = j;
-
-                float extrema_thr = 0.5f * contrast_thr / n_layers;
-                detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
-                                 dog_pyr[prev], dog_pyr[center], dog_pyr[next],
-                                 layer, max_feat, extrema_thr);
-            }
-
-            extrema_feat = min(extrema_feat, max_feat);
-
-            if (extrema_feat == 0) {
-                memFree(extrema_x);
-                memFree(extrema_y);
-                memFree(extrema_layer);
-
-                continue;
-            }
-
-            unsigned interp_feat = 0;
-
-            float* interp_x = memAlloc<float>(extrema_feat);
-            float* interp_y = memAlloc<float>(extrema_feat);
-            unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
-            float* interp_response = memAlloc<float>(extrema_feat);
-            float* interp_size = memAlloc<float>(extrema_feat);
-
-            interpolateExtrema<T>(interp_x, interp_y, interp_layer,
-                                  interp_response, interp_size, &interp_feat,
-                                  extrema_x, extrema_y, extrema_layer, extrema_feat,
-                                  dog_pyr, max_feat, i, n_layers,
-                                  contrast_thr, edge_thr, init_sigma, img_scale);
-
-            interp_feat = min(interp_feat, max_feat);
-
-            if (interp_feat == 0) {
-                memFree(interp_x);
-                memFree(interp_y);
-                memFree(interp_layer);
-                memFree(interp_response);
-                memFree(interp_size);
-
-                continue;
-            }
-
-            std::vector<feat_t> sorted_feat;
-            array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
-            std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
-
-            memFree(interp_x);
-            memFree(interp_y);
-            memFree(interp_layer);
-            memFree(interp_response);
-            memFree(interp_size);
-
-            unsigned nodup_feat = 0;
-
-            float* nodup_x = memAlloc<float>(interp_feat);
-            float* nodup_y = memAlloc<float>(interp_feat);
-            unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
-            float* nodup_response = memAlloc<float>(interp_feat);
-            float* nodup_size = memAlloc<float>(interp_feat);
-
-            removeDuplicates(nodup_x, nodup_y, nodup_layer,
-                             nodup_response, nodup_size, &nodup_feat,
-                             sorted_feat);
-
-            const unsigned max_oriented_feat = nodup_feat * 3;
-
-            float* oriented_x = memAlloc<float>(max_oriented_feat);
-            float* oriented_y = memAlloc<float>(max_oriented_feat);
-            unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
-            float* oriented_response = memAlloc<float>(max_oriented_feat);
-            float* oriented_size = memAlloc<float>(max_oriented_feat);
-            float* oriented_ori = memAlloc<float>(max_oriented_feat);
-
-            unsigned oriented_feat = 0;
-
-            calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
-                               oriented_response, oriented_size, oriented_ori, &oriented_feat,
-                               nodup_x, nodup_y, nodup_layer,
-                               nodup_response, nodup_size, nodup_feat,
-                               gauss_pyr, max_oriented_feat, i, n_layers, double_input);
-
-            memFree(nodup_x);
-            memFree(nodup_y);
-            memFree(nodup_layer);
-            memFree(nodup_response);
-            memFree(nodup_size);
-
-            if (oriented_feat == 0) {
-                memFree(oriented_x);
-                memFree(oriented_y);
-                memFree(oriented_layer);
-                memFree(oriented_response);
-                memFree(oriented_size);
-                memFree(oriented_ori);
-
-                continue;
-            }
-
-            float* desc = memAlloc<float>(oriented_feat * desc_len);
-
-            float scale = 1.f/(1 << i);
-            if (double_input) scale *= 2.f;
-
-            if (compute_GLOH)
-                computeGLOHDescriptor<T>(desc, desc_len,
-                                         oriented_x, oriented_y, oriented_layer,
-                                         oriented_response, oriented_size, oriented_ori,
-                                         oriented_feat, gauss_pyr, d, rb, ab, hb,
-                                         scale, i, n_layers);
-            else
-                computeDescriptor<T>(desc, desc_len,
-                                     oriented_x, oriented_y, oriented_layer,
-                                     oriented_response, oriented_size, oriented_ori,
-                                     oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
-
-            total_feat += oriented_feat;
-            feat_pyr[i] = oriented_feat;
-
-            if (oriented_feat > 0) {
-                x_pyr[i] = oriented_x;
-                y_pyr[i] = oriented_y;
-                response_pyr[i] = oriented_response;
-                ori_pyr[i] = oriented_ori;
-                size_pyr[i] = oriented_size;
-                desc_pyr[i] = desc;
-            }
-        }
-
-        if (total_feat > 0) {
-            const af::dim4 total_feat_dims(total_feat);
-            const af::dim4 desc_dims(desc_len, total_feat);
-
-            // Allocate output memory
-            x     = createEmptyArray<float>(total_feat_dims);
-            y     = createEmptyArray<float>(total_feat_dims);
-            score = createEmptyArray<float>(total_feat_dims);
-            ori   = createEmptyArray<float>(total_feat_dims);
-            size  = createEmptyArray<float>(total_feat_dims);
-            desc  = createEmptyArray<float>(desc_dims);
-
-            float* x_ptr = x.get();
-            float* y_ptr = y.get();
-            float* score_ptr = score.get();
-            float* ori_ptr = ori.get();
-            float* size_ptr = size.get();
-            float* desc_ptr = desc.get();
-
-            unsigned offset = 0;
-            for (unsigned i = 0; i < n_octaves; i++) {
-                if (feat_pyr[i] == 0)
-                    continue;
-
-                memcpy(x_ptr+offset,     x_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(y_ptr+offset,     y_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
-                memcpy(ori_ptr+offset,   ori_pyr[i],      feat_pyr[i] * sizeof(float));
-                memcpy(size_ptr+offset,  size_pyr[i],     feat_pyr[i] * sizeof(float));
-
-                memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
-
-                memFree(x_pyr[i]);
-                memFree(y_pyr[i]);
-                memFree(response_pyr[i]);
-                memFree(ori_pyr[i]);
-                memFree(size_pyr[i]);
-                memFree(desc_pyr[i]);
-
-                offset += feat_pyr[i];
-            }
-        }
-
-        return total_feat;
-    }
-}
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index 3c6b1740d5..5ece9bf65e 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -13,88 +13,32 @@
 #include <Array.hpp>
 #include <sobel.hpp>
 #include <convolve.hpp>
-#include <err_cpu.hpp>
-#include <utility>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/sobel.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename Ti, typename To, bool isDX>
-void derivative(To *optr, Ti const *iptr, dim4 const &dims, dim4 const &strides)
-{
-    for(dim_t b3=0; b3<dims[3]; ++b3) {
-    for(dim_t b2=0; b2<dims[2]; ++b2) {
-
-        for(dim_t j=0; j<dims[1]; ++j) {
-
-            int joff  = j;
-            int _joff = j-1;
-            int joff_ = j+1;
-            int joffset = j*strides[1];
-
-            for(dim_t i=0; i<dims[0]; ++i) {
-
-                To accum = To(0);
-
-                int  ioff = i;
-                int _ioff = i-1;
-                int ioff_ = i+1;
-
-                To NW = (_ioff>=0 && _joff>=0) ?
-                        iptr[_joff*strides[1]+_ioff*strides[0]] : 0;
-                To SW = (ioff_<(int)dims[0] && _joff>=0) ?
-                        iptr[_joff*strides[1]+ioff_*strides[0]] : 0;
-                To NE = (_ioff>=0 && joff_<(int)dims[1]) ?
-                        iptr[joff_*strides[1]+_ioff*strides[0]] : 0;
-                To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ?
-                        iptr[joff_*strides[1]+ioff_*strides[0]] : 0;
-
-                if (isDX) {
-                    To W  = _joff>=0 ?
-                            iptr[_joff*strides[1]+ioff*strides[0]] : 0;
-
-                    To E  = joff_<(int)dims[1] ?
-                            iptr[joff_*strides[1]+ioff*strides[0]] : 0;
-
-                    accum = NW+SW - (NE+SE) + 2*(W-E);
-                } else {
-                    To N  = _ioff>=0 ?
-                            iptr[joff*strides[1]+_ioff*strides[0]] : 0;
-
-                    To S  = ioff_<(int)dims[0] ?
-                            iptr[joff*strides[1]+ioff_*strides[0]] : 0;
-
-                    accum = NW+NE - (SW+SE) + 2*(N-S);
-                }
-
-                optr[joffset+i*strides[0]] = accum;
-            }
-        }
-
-        optr += strides[2];
-        iptr += strides[2];
-    }
-    optr += strides[3];
-    iptr += strides[3];
-    }
-}
-
 template<typename Ti, typename To>
 std::pair< Array<To>, Array<To> >
 sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
 {
+    img.eval();
+    // ker_size is for future proofing; this argument is not used
+    // currently
     Array<To> dx = createEmptyArray<To>(img.dims());
     Array<To> dy = createEmptyArray<To>(img.dims());
 
-    derivative<Ti, To, true >(dx.get(), img.get(), img.dims(), img.strides());
-    derivative<Ti, To, false>(dy.get(), img.get(), img.dims(), img.strides());
+    getQueue().enqueue(kernel::derivative<Ti, To, true >, dx, img);
+    getQueue().enqueue(kernel::derivative<Ti, To, false>, dy, img);
 
     return std::make_pair(dx, dy);
 }
 
-#define INSTANTIATE(Ti, To)                                                 \
+#define INSTANTIATE(Ti, To)                                               \
     template std::pair< Array<To>, Array<To> >                            \
     sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size);
 
diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp
index 1e88e8d915..367afa3884 100644
--- a/src/backend/cpu/solve.cpp
+++ b/src/backend/cpu/solve.cpp
@@ -11,52 +11,40 @@
 #include <err_common.hpp>
 
 #if defined(WITH_CPU_LINEAR_ALGEBRA)
-
 #include <af/dim4.hpp>
 #include <handle.hpp>
-#include <range.hpp>
-#include <iostream>
 #include <cassert>
 #include <err_cpu.hpp>
-
 #include <lapack_helper.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
 
 template<typename T>
 using gesv_func_def = int (*)(ORDER_TYPE, int, int,
-                              T *, int,
-                              int *,
-                              T *, int);
+                              T *, int, int *, T *, int);
 
 template<typename T>
-using gels_func_def = int (*)(ORDER_TYPE, char,
-                              int, int, int,
-                              T *, int,
-                              T *, int);
+using gels_func_def = int (*)(ORDER_TYPE, char, int, int, int,
+                              T *, int, T *, int);
 
 template<typename T>
-using getrs_func_def = int (*)(ORDER_TYPE, char,
-                               int, int,
-                               const T *, int,
-                               const int *,
-                               T *, int);
+using getrs_func_def = int (*)(ORDER_TYPE, char, int, int,
+                               const T *, int, const int *, T *, int);
 
 template<typename T>
-using trtrs_func_def = int (*)(ORDER_TYPE,
-                               char, char, char,
-                               int, int,
-                               const T *, int,
-                               T *, int);
+using trtrs_func_def = int (*)(ORDER_TYPE, char, char, char, int, int,
+                               const T *, int, T *, int);
 
 
-#define SOLVE_FUNC_DEF( FUNC )                                      \
+#define SOLVE_FUNC_DEF( FUNC )                                 \
 template<typename T> FUNC##_func_def<T> FUNC##_func();
 
 
-#define SOLVE_FUNC( FUNC, TYPE, PREFIX )                            \
-template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+#define SOLVE_FUNC( FUNC, TYPE, PREFIX )                       \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()       \
 { return & LAPACK_NAME(PREFIX##FUNC); }
 
 SOLVE_FUNC_DEF( gesv )
@@ -87,16 +75,20 @@ template<typename T>
 Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                  const Array<T> &b, const af_mat_prop options)
 {
-    int N = A.dims()[0];
-    int NRHS = b.dims()[1];
+    A.eval();
+    pivot.eval();
+    b.eval();
 
+    int N        = A.dims()[0];
+    int NRHS     = b.dims()[1];
     Array< T > B = copyArray<T>(b);
 
-    getrs_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
-                    N, NRHS,
-                    A.get(), A.strides()[1],
-                    pivot.get(),
-                    B.get(), B.strides()[1]);
+    auto func = [=] (Array<T> A, Array<T> B, Array<int> pivot, int N, int NRHS) {
+        getrs_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                        N, NRHS, A.get(), A.strides()[1],
+                        pivot.get(), B.get(), B.strides()[1]);
+    };
+    getQueue().enqueue(func, A, B, pivot, N, NRHS);
 
     return B;
 }
@@ -104,17 +96,24 @@ Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
 template<typename T>
 Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
 {
+    A.eval();
+    b.eval();
+
     Array<T> B = copyArray<T>(b);
-    int N = B.dims()[0];
-    int NRHS = B.dims()[1];
-
-    trtrs_func<T>()(AF_LAPACK_COL_MAJOR,
-                    options & AF_MAT_UPPER ? 'U' : 'L',
-                    'N', // transpose flag
-                    options & AF_MAT_DIAG_UNIT ? 'U' : 'N',
-                    N, NRHS,
-                    A.get(), A.strides()[1],
-                    B.get(), B.strides()[1]);
+    int N      = B.dims()[0];
+    int NRHS   = B.dims()[1];
+
+    auto func = [=] (Array<T> A, Array<T> B, int N, int NRHS, const af_mat_prop options) {
+        trtrs_func<T>()(AF_LAPACK_COL_MAJOR,
+                        options & AF_MAT_UPPER ? 'U' : 'L',
+                        'N', // transpose flag
+                        options & AF_MAT_DIAG_UNIT ? 'U' : 'N',
+                        N, NRHS,
+                        A.get(), A.strides()[1],
+                        B.get(), B.strides()[1]);
+    };
+    getQueue().enqueue(func, A, B, N, NRHS, options);
+
     return B;
 }
 
@@ -122,9 +121,10 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
 template<typename T>
 Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
 {
+    a.eval();
+    b.eval();
 
-    if (options & AF_MAT_UPPER ||
-        options & AF_MAT_LOWER) {
+    if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
         return triangleSolve<T>(a, b, options);
     }
 
@@ -132,41 +132,34 @@ Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
     int N = a.dims()[1];
     int K = b.dims()[1];
 
-
     Array<T> A = copyArray<T>(a);
     Array<T> B = padArray<T, T>(b, dim4(max(M, N), K));
 
     if(M == N) {
         Array<int> pivot = createEmptyArray<int>(dim4(N, 1, 1));
-        gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
-                       A.get(), A.strides()[1],
-                       pivot.get(),
-                       B.get(), B.strides()[1]);
+
+        auto func = [=] (Array<T> A, Array<T> B, Array<int> pivot, int N, int K) {
+            gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K, A.get(), A.strides()[1],
+                           pivot.get(), B.get(), B.strides()[1]);
+        };
+        getQueue().enqueue(func, A, B, pivot, N, K);
     } else {
-        int sM = a.strides()[1];
-        int sN = a.strides()[2] / sM;
+        auto func = [=] (Array<T> A, Array<T> B, int M, int N, int K) {
+            int sM = A.strides()[1];
+            int sN = A.strides()[2] / sM;
 
-        gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
-                       M, N, K,
-                       A.get(), A.strides()[1],
-                       B.get(), max(sM, sN));
+            gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                    M, N, K,
+                    A.get(), A.strides()[1],
+                    B.get(), max(sM, sN));
+        };
         B.resetDims(dim4(N, K));
+        getQueue().enqueue(func, A, B, M, N, K);
     }
 
     return B;
 }
 
-#define INSTANTIATE_SOLVE(T)                                            \
-    template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
-                               const af_mat_prop options);              \
-    template Array<T> solveLU<T>(const Array<T> &A, const Array<int> &pivot, \
-                                 const Array<T> &b, const af_mat_prop options); \
-
-INSTANTIATE_SOLVE(float)
-INSTANTIATE_SOLVE(cfloat)
-INSTANTIATE_SOLVE(double)
-INSTANTIATE_SOLVE(cdouble)
-
 }
 
 #else
@@ -178,17 +171,22 @@ template<typename T>
 Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                  const Array<T> &b, const af_mat_prop options)
 {
-    AF_ERROR("Linear Algebra is diabled on CPU",
-             AF_ERR_NOT_CONFIGURED);
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
 }
 
 template<typename T>
 Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
 {
-    AF_ERROR("Linear Algebra is diabled on CPU",
-              AF_ERR_NOT_CONFIGURED);
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
 }
 
+#endif
+
+namespace cpu
+{
+
 #define INSTANTIATE_SOLVE(T)                                            \
     template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
                                const af_mat_prop options);              \
@@ -199,6 +197,5 @@ INSTANTIATE_SOLVE(float)
 INSTANTIATE_SOLVE(cfloat)
 INSTANTIATE_SOLVE(double)
 INSTANTIATE_SOLVE(cdouble)
-}
 
-#endif
+}
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index 0b3fb9aabe..bc6396b258 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -11,78 +11,43 @@
 #include <sort.hpp>
 #include <math.hpp>
 #include <copy.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <functional>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/sort.hpp>
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Based off of http://stackoverflow.com/a/12399290
-    template<typename T, bool isAscending>
-    void sort0(Array<T> &val)
-    {
-        // initialize original index locations
-        T *val_ptr = val.get();
-
-        function<bool(T, T)> op = greater<T>();
-        if(isAscending) { op = less<T>(); }
-
-        T *comp_ptr = nullptr;
-        for(dim_t w = 0; w < val.dims()[3]; w++) {
-            dim_t valW = w * val.strides()[3];
-            for(dim_t z = 0; z < val.dims()[2]; z++) {
-                dim_t valWZ = valW + z * val.strides()[2];
-                for(dim_t y = 0; y < val.dims()[1]; y++) {
-
-                    dim_t valOffset = valWZ + y * val.strides()[1];
 
-                    comp_ptr = val_ptr + valOffset;
-                    std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
-                }
-            }
-        }
-        return;
-    }
+template<typename T, bool isAscending>
+Array<T> sort(const Array<T> &in, const unsigned dim)
+{
+    in.eval();
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    Array<T> sort(const Array<T> &in, const unsigned dim)
-    {
-        Array<T> out = copyArray<T>(in);
-        switch(dim) {
-            case 0: sort0<T, isAscending>(out);
-                    break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-        }
-        return out;
+    Array<T> out = copyArray<T>(in);
+    switch(dim) {
+        case 0: getQueue().enqueue(kernel::sort0<T, isAscending>, out); break;
+        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+    return out;
+}
 
 #define INSTANTIATE(T)                                                  \
     template Array<T> sort<T, true>(const Array<T> &in, const unsigned dim); \
     template Array<T> sort<T,false>(const Array<T> &in, const unsigned dim); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    //INSTANTIATE(cfloat)
-    //INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index 4b0a092834..5a99257033 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -9,103 +9,31 @@
 
 #include <Array.hpp>
 #include <sort_by_key.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <algorithm>
-#include <numeric>
-#include <queue>
-#include <future>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::future;
-using std::async;
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/sort_by_key.hpp>
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-
-    template<typename Tk, typename Tv, bool isAscending>
-    void sort0_by_key(Array<Tk> &okey, Array<Tv> &oval, const Array<Tk> &ikey, const Array<Tv> &ival)
-    {
-        function<bool(Tk, Tk)> op = greater<Tk>();
-        if(isAscending) { op = less<Tk>(); }
-
-        // Get pointers and initialize original index locations
-        Array<uint> oidx = createValueArray(ikey.dims(), 0u);
-            uint *oidx_ptr = oidx.get();
-              Tk *okey_ptr = okey.get();
-              Tv *oval_ptr = oval.get();
-        const Tk *ikey_ptr = ikey.get();
-        const Tv *ival_ptr = ival.get();
-
-        std::vector<uint> seq_vec(oidx.dims()[0]);
-        std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
-        const Tk *comp_ptr = nullptr;
-        auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-        for(dim_t w = 0; w < ikey.dims()[3]; w++) {
-            dim_t okeyW = w * okey.strides()[3];
-            dim_t ovalW = w * oval.strides()[3];
-            dim_t oidxW = w * oidx.strides()[3];
-            dim_t ikeyW = w * ikey.strides()[3];
-            dim_t ivalW = w * ival.strides()[3];
-
-            for(dim_t z = 0; z < ikey.dims()[2]; z++) {
-                dim_t okeyWZ = okeyW + z * okey.strides()[2];
-                dim_t ovalWZ = ovalW + z * oval.strides()[2];
-                dim_t oidxWZ = oidxW + z * oidx.strides()[2];
-                dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
-                dim_t ivalWZ = ivalW + z * ival.strides()[2];
-
-                for(dim_t y = 0; y < ikey.dims()[1]; y++) {
-
-                    dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
-                    dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
-                    dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
-                    dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
-                    dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
 
-                    uint *ptr = oidx_ptr + oidxOffset;
-                    std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
-                    comp_ptr = ikey_ptr + ikeyOffset;
-                    std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
-
-                    for (dim_t i = 0; i < oval.dims()[0]; ++i){
-                        uint sortIdx = oidx_ptr[oidxOffset + i];
-                        okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
-                        oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename Tk, typename Tv, bool isAscending>
-    void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
-               const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
-    {
-        okey = createEmptyArray<Tk>(ikey.dims());
-        oval = createEmptyArray<Tv>(ival.dims());
-        switch(dim) {
-            case 0: sort0_by_key<Tk, Tv, isAscending>(okey, oval, ikey, ival);
-                    break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-        }
+template<typename Tk, typename Tv, bool isAscending>
+void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
+           const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
+{
+    ikey.eval();
+    ival.eval();
+
+    okey = createEmptyArray<Tk>(ikey.dims());  // outputs are replaced by fresh arrays shaped like the inputs
+    oval = createEmptyArray<Tv>(ival.dims());
+    Array<uint> oidx = createValueArray(ikey.dims(), 0u);  // zero-filled uint array shaped like ikey, handed to the kernel below
+    oidx.eval();
+
+    switch(dim) {
+        case 0: getQueue().enqueue(kernel::sort0_by_key<Tk, Tv, isAscending>,
+                                   okey, oval, oidx, ikey, ival); break;
+        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);  // only dim == 0 is handled
     }
+}
 
 #define INSTANTIATE(Tk, Tv)                                             \
     template void                                                       \
@@ -128,14 +56,15 @@ namespace cpu
     INSTANTIATE(Tk, uintl)     \
 
 
-    INSTANTIATE1(float)
-    INSTANTIATE1(double)
-    INSTANTIATE1(int)
-    INSTANTIATE1(uint)
-    INSTANTIATE1(char)
-    INSTANTIATE1(uchar)
-    INSTANTIATE1(short)
-    INSTANTIATE1(ushort)
-    INSTANTIATE1(intl)
-    INSTANTIATE1(uintl)
+INSTANTIATE1(float)
+INSTANTIATE1(double)
+INSTANTIATE1(int)
+INSTANTIATE1(uint)
+INSTANTIATE1(char)
+INSTANTIATE1(uchar)
+INSTANTIATE1(short)
+INSTANTIATE1(ushort)
+INSTANTIATE1(intl)
+INSTANTIATE1(uintl)
+
 }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index eb6b4bee60..77860ede18 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -10,86 +10,27 @@
 #include <Array.hpp>
 #include <sort_index.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <numeric>
-#include <queue>
-#include <future>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::future;
-using std::async;
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/sort_index.hpp>
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
-    {
-        // initialize original index locations
-           uint *idx_ptr = idx.get();
-              T *val_ptr = val.get();
-        const T *in_ptr  = in.get();
-        function<bool(T, T)> op = greater<T>();
-        if(isAscending) { op = less<T>(); }
-
-        std::vector<uint> seq_vec(idx.dims()[0]);
-        std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
-        const T *comp_ptr = nullptr;
-        auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-        for(dim_t w = 0; w < in.dims()[3]; w++) {
-            dim_t valW = w * val.strides()[3];
-            dim_t idxW = w * idx.strides()[3];
-            dim_t  inW = w *  in.strides()[3];
-            for(dim_t z = 0; z < in.dims()[2]; z++) {
-                dim_t valWZ = valW + z * val.strides()[2];
-                dim_t idxWZ = idxW + z * idx.strides()[2];
-                dim_t  inWZ =  inW + z *  in.strides()[2];
-                for(dim_t y = 0; y < in.dims()[1]; y++) {
-
-                    dim_t valOffset = valWZ + y * val.strides()[1];
-                    dim_t idxOffset = idxWZ + y * idx.strides()[1];
-                    dim_t inOffset  =  inWZ + y *  in.strides()[1];
-
-                    uint *ptr = idx_ptr + idxOffset;
-                    std::copy(seq_vec.begin(), seq_vec.end(), ptr);
 
-                    comp_ptr = in_ptr + inOffset;
-                    std::stable_sort(ptr, ptr + in.dims()[0], comparator);
-
-                    for (dim_t i = 0; i < val.dims()[0]; ++i){
-                        val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
-                    }
-                }
-            }
-        }
-
-        return;
-    }
+template<typename T, bool isAscending>
+void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+{
+    in.eval();
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
-    {
-        val = createEmptyArray<T>(in.dims());
-        idx = createEmptyArray<uint>(in.dims());
-        switch(dim) {
-            case 0: sort0_index<T, isAscending>(val, idx, in);
-                    break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-        }
+    val = createEmptyArray<T>(in.dims());
+    idx = createEmptyArray<uint>(in.dims());
+    switch(dim) {
+        case 0: getQueue().enqueue(kernel::sort0_index<T, isAscending>, val, idx, in); break;
+        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+}
 
 #define INSTANTIATE(T)                                                  \
     template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
@@ -97,16 +38,17 @@ namespace cpu
     template void sort_index<T,false>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
                                       const uint dim);                  \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    //INSTANTIATE(cfloat)
-    //INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp
index 39f375a6fe..24c945c20b 100644
--- a/src/backend/cpu/surface.cpp
+++ b/src/backend/cpu/surface.cpp
@@ -12,37 +12,40 @@
 #include <Array.hpp>
 #include <surface.hpp>
 #include <err_cpu.hpp>
-#include <stdexcept>
 #include <graphics_common.hpp>
-#include <reduce.hpp>
-#include <memory.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    void copy_surface(const Array<T> &P, fg::Surface* surface)
-    {
-        CheckGL("Before CopyArrayToVBO");
-
-        glBindBuffer(GL_ARRAY_BUFFER, surface->vbo());
-        glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get());
-        glBindBuffer(GL_ARRAY_BUFFER, 0);
-
-        CheckGL("In CopyArrayToVBO");
-    }
-
-    #define INSTANTIATE(T)  \
-        template void copy_surface<T>(const Array<T> &P, fg::Surface* surface);
-
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+
+template<typename T>
+void copy_surface(const Array<T> &P, fg::Surface* surface)
+{
+    P.eval();
+    getQueue().sync();  // NOTE(review): presumably waits for queued work so P.get() below reads finished data -- confirm
+    CheckGL("Before CopyArrayToVBO");
+
+    glBindBuffer(GL_ARRAY_BUFFER, surface->vbo());
+    glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get());  // copy surface->size() bytes from P into the bound buffer at offset 0
+    glBindBuffer(GL_ARRAY_BUFFER, 0);  // unbind GL_ARRAY_BUFFER
+
+    CheckGL("In CopyArrayToVBO");
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_surface<T>(const Array<T> &P, fg::Surface* surface);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index 77493915c0..55a2357206 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -11,119 +11,54 @@
 #include <Array.hpp>
 #include <cmath>
 #include <math.hpp>
+#include <memory>
+#include <platform.hpp>
+#include <queue.hpp>
+#include <kernel/susan.hpp>
 
 using af::features;
+using std::shared_ptr;
 
 namespace cpu
 {
 
-template<typename T>
-void susan_responses(T* resp_out, const T* in,
-                     const unsigned idim0, const unsigned idim1,
-                     const int radius, const float t, const float g,
-                     const unsigned border_len)
-{
-    const unsigned r = border_len;
-    const int rSqrd = radius*radius;
-
-    for (unsigned y = r; y < idim1 - r; ++y) {
-        for (unsigned x = r; x < idim0 - r; ++x) {
-            const unsigned idx = y * idim0 + x;
-            T m_0 = in[idx];
-            float nM = 0.0f;
-
-            for (int i=-radius; i<=radius; ++i) {
-                for (int j=-radius; j<=radius; ++j) {
-                    if (i*i + j*j < rSqrd) {
-                        int p = x + i;
-                        int q = y + j;
-                        T m = in[p + idim0 * q];
-                        float exp_pow = std::pow((m - m_0)/t, 6.0);
-                        float cM = std::exp(-exp_pow);
-                        nM += cM;
-                    }
-                }
-            }
-
-            resp_out[idx] = nM < g ? g - nM : T(0);
-        }
-    }
-}
-
-template<typename T>
-void non_maximal(float* x_out, float* y_out, float* resp_out,
-                 unsigned* count, const unsigned idim0, const unsigned idim1,
-                 const T* resp_in, const unsigned border_len, const unsigned max_corners)
-{
-    // Responses on the border don't have 8-neighbors to compare, discard them
-    const unsigned r = border_len + 1;
-
-    for (unsigned y = r; y < idim1 - r; y++) {
-        for (unsigned x = r; x < idim0 - r; x++) {
-            const T v = resp_in[y * idim0 + x];
-
-            // Find maximum neighborhood response
-            T max_v;
-            max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
-            max_v = max(max_v, resp_in[(y-1) * idim0 + x  ]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x  ]);
-            max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
-            max_v = max(max_v, resp_in[(y)   * idim0 + x+1]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
-
-            // Stores corner to {x,y,resp}_out if it's response is maximum compared
-            // to its 8-neighborhood and greater or equal minimum response
-            if (v > max_v) {
-                const unsigned idx = *count;
-                *count += 1;
-                if (idx < max_corners) {
-                    x_out[idx]    = (float)x;
-                    y_out[idx]    = (float)y;
-                    resp_out[idx] = (float)v;
-                }
-            }
-        }
-    }
-}
-
 template<typename T>
 unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                const Array<T> &in,
                const unsigned radius, const float diff_thr, const float geom_thr,
                const float feature_ratio, const unsigned edge)
 {
-    dim4 idims = in.dims();
+    in.eval();
 
+    dim4 idims = in.dims();
     const unsigned corner_lim = in.elements() * feature_ratio;
-    float* x_corners          = memAlloc<float>(corner_lim);
-    float* y_corners          = memAlloc<float>(corner_lim);
-    float* resp_corners       = memAlloc<float>(corner_lim);
-
-    T* resp = memAlloc<T>(in.elements());
-    unsigned corners_found = 0;
 
-    susan_responses<T>(resp, in.get(), idims[0], idims[1], radius, diff_thr, geom_thr, edge);
+    auto x_corners    = createEmptyArray<float>(dim4(corner_lim));
+    auto y_corners    = createEmptyArray<float>(dim4(corner_lim));
+    auto resp_corners = createEmptyArray<float>(dim4(corner_lim));
+    auto response     = createEmptyArray<T>(dim4(in.elements()));
+    auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1), memFree<unsigned>);
+    corners_found.get()[0] = 0;
 
-    non_maximal<T>(x_corners, y_corners, resp_corners, &corners_found,
-                   idims[0], idims[1], resp, edge, corner_lim);
+    getQueue().enqueue(kernel::susan_responses<T>, response, in, idims[0], idims[1],
+                       radius, diff_thr, geom_thr, edge);
+    getQueue().enqueue(kernel::non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
+                       idims[0], idims[1], response, edge, corner_lim);
+    getQueue().sync();
 
-    memFree(resp);
-
-    const unsigned corners_out = min(corners_found, corner_lim);
+    const unsigned corners_out = min((corners_found.get())[0], corner_lim);
     if (corners_out == 0) {
-        memFree(x_corners);
-        memFree(y_corners);
-        memFree(resp_corners);
         x_out    = createEmptyArray<float>(dim4());
         y_out    = createEmptyArray<float>(dim4());
         resp_out = createEmptyArray<float>(dim4());
         return 0;
     } else {
-
-        x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
-        y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
-        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
+        x_out = x_corners;
+        y_out = y_corners;
+        resp_out = resp_corners;
+        x_out.resetDims(dim4(corners_out));
+        y_out.resetDims(dim4(corners_out));
+        resp_out.resetDims(dim4(corners_out));
         return corners_out;
     }
 }
diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp
index 461b9014aa..2ac58aab3f 100644
--- a/src/backend/cpu/svd.cpp
+++ b/src/backend/cpu/svd.cpp
@@ -10,12 +10,13 @@
 #include <Array.hpp>
 #include <svd.hpp>
 #include <err_common.hpp>
-
 #include <err_cpu.hpp>
 
 #if defined(WITH_CPU_LINEAR_ALGEBRA)
 #include <lapack_helper.hpp>
 #include <copy.hpp>
+#include <platform.hpp>
+#include <queue.hpp>
 
 namespace cpu
 {
@@ -29,93 +30,106 @@ namespace cpu
 
 #if defined(USE_MKL) || defined(__APPLE__)
 
-    template<typename T, typename Tr>
-    using svd_func_def = int (*)(ORDER_TYPE,
-                                 char jobz,
-                                 int m, int n,
-                                 T* in, int ldin,
-                                 Tr* s,
-                                 T* u, int ldu,
-                                 T* vt, int ldvt);
-
-    SVD_FUNC_DEF( gesdd )
-    SVD_FUNC(gesdd, float  , float , s)
-    SVD_FUNC(gesdd, double , double, d)
-    SVD_FUNC(gesdd, cfloat , float , c)
-    SVD_FUNC(gesdd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+                             char jobz,
+                             int m, int n,
+                             T* in, int ldin,
+                             Tr* s,
+                             T* u, int ldu,
+                             T* vt, int ldvt);
+
+SVD_FUNC_DEF( gesdd )
+SVD_FUNC(gesdd, float  , float , s)
+SVD_FUNC(gesdd, double , double, d)
+SVD_FUNC(gesdd, cfloat , float , c)
+SVD_FUNC(gesdd, cdouble, double, z)
 
 #else   // Atlas causes memory freeing issues with using gesdd
 
-    template<typename T, typename Tr>
-    using svd_func_def = int (*)(ORDER_TYPE,
-                                 char jobu, char jobvt,
-                                 int m, int n,
-                                 T* in, int ldin,
-                                 Tr* s,
-                                 T* u, int ldu,
-                                 T* vt, int ldvt,
-                                 Tr *superb);
-
-    SVD_FUNC_DEF( gesvd )
-    SVD_FUNC(gesvd, float  , float , s)
-    SVD_FUNC(gesvd, double , double, d)
-    SVD_FUNC(gesvd, cfloat , float , c)
-    SVD_FUNC(gesvd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+                             char jobu, char jobvt,
+                             int m, int n,
+                             T* in, int ldin,
+                             Tr* s,
+                             T* u, int ldu,
+                             T* vt, int ldvt,
+                             Tr *superb);
+
+SVD_FUNC_DEF( gesvd )
+SVD_FUNC(gesvd, float  , float , s)
+SVD_FUNC(gesvd, double , double, d)
+SVD_FUNC(gesvd, cfloat , float , c)
+SVD_FUNC(gesvd, cdouble, double, z)
 
 #endif
 
-    template <typename T, typename Tr>
-    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
-    {
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+    s.eval();
+    u.eval();
+    vt.eval();
+    in.eval();
+
+    auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
         dim4 iDims = in.dims();
         int M = iDims[0];
         int N = iDims[1];
 
 #if defined(USE_MKL) || defined(__APPLE__)
         svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1],
-                          s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
+                s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
 #else
         std::vector<Tr> superb(std::min(M, N));
         svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1],
-                          s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
+                s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
 #endif
-    }
-
-    template <typename T, typename Tr>
-    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
-    {
-        Array<T> in_copy = copyArray<T>(in);
-        svdInPlace(s, u, vt, in_copy);
-    }
+    };
+    getQueue().enqueue(func, s, u, vt, in);
+}
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+    Array<T> in_copy = copyArray<T>(in);
+    svdInPlace(s, u, vt, in_copy);
+}
+
 }
 
 #else
 
 namespace cpu
 {
-    template <typename T, typename Tr>
-    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
-    {
-        AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
-    }
-
-    template <typename T, typename Tr>
-    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
-    {
-        AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
-    }
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
 }
 
 #endif
 
-namespace cpu {
+namespace cpu
+{
 
 #define INSTANTIATE_SVD(T, Tr)                                          \
     template void svd<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, const Array<T> &in); \
     template void svdInPlace<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, Array<T> &in);
 
-    INSTANTIATE_SVD(float  , float )
-    INSTANTIATE_SVD(double , double)
-    INSTANTIATE_SVD(cfloat , float )
-    INSTANTIATE_SVD(cdouble, double)
+INSTANTIATE_SVD(float  , float )
+INSTANTIATE_SVD(double , double)
+INSTANTIATE_SVD(cfloat , float )
+INSTANTIATE_SVD(cdouble, double)
+
 }
diff --git a/src/backend/cpu/threads b/src/backend/cpu/threads
new file mode 160000
index 0000000000..5e778ce0a7
--- /dev/null
+++ b/src/backend/cpu/threads
@@ -0,0 +1 @@
+Subproject commit 5e778ce0a7f0f80af9d32ea3569df3dbec834f59
diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp
index 77e72afd09..0fe52c6398 100644
--- a/src/backend/cpu/tile.cpp
+++ b/src/backend/cpu/tile.cpp
@@ -9,69 +9,47 @@
 
 #include <Array.hpp>
 #include <tile.hpp>
 #include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
+#include <kernel/tile.hpp>
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> tile(const Array<T> &in, const af::dim4 &tileDims)
-    {
-        const af::dim4 iDims = in.dims();
-        af::dim4 oDims = iDims;
-        oDims *= tileDims;
 
-        if(iDims.elements() == 0 || oDims.elements() == 0) {
-            throw std::runtime_error("Elements are 0");
-        }
+template<typename T>
+Array<T> tile(const Array<T> &in, const af::dim4 &tileDims)
+{
+    in.eval();
 
-        Array<T> out = createEmptyArray<T>(oDims);
+    const af::dim4 iDims = in.dims();
+    af::dim4 oDims = iDims;
+    oDims *= tileDims;
 
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
+    if(iDims.elements() == 0 || oDims.elements() == 0) {
+        throw std::runtime_error("Elements are 0");
+    }
 
-        const af::dim4 ist = in.strides();
-        const af::dim4 ost = out.strides();
+    Array<T> out = createEmptyArray<T>(oDims);
 
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const dim_t iw = ow % iDims[3];
-            const dim_t iW = iw * ist[3];
-            const dim_t oW = ow * ost[3];
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const dim_t iz = oz % iDims[2];
-                const dim_t iZW = iW + iz * ist[2];
-                const dim_t oZW = oW + oz * ost[2];
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const dim_t iy = oy % iDims[1];
-                    const dim_t iYZW = iZW + iy * ist[1];
-                    const dim_t oYZW = oZW + oy * ost[1];
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const dim_t ix = ox % iDims[0];
-                        const dim_t iMem = iYZW + ix;
-                        const dim_t oMem = oYZW + ox;
-                        outPtr[oMem] = inPtr[iMem];
-                    }
-                }
-            }
-        }
+    getQueue().enqueue(kernel::tile<T>, out, in);
 
-        return out;
-    }
+    return out;
+}
 
 #define INSTANTIATE(T)                                                         \
     template Array<T> tile<T>(const Array<T> &in, const af::dim4 &tileDims);  \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index 68e8d96eba..3a76fb2f24 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -10,138 +10,59 @@
 #include <Array.hpp>
 #include <transform.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
+#include <platform.hpp>
 #include "transform_interp.hpp"
+#include <kernel/transform.hpp>
 
 namespace cpu
 {
-    template <typename T>
-    void calc_affine_inverse(T *txo, const T *txi)
-    {
-        T det = txi[0]*txi[4] - txi[1]*txi[3];
 
-        txo[0] = txi[4] / det;
-        txo[1] = txi[3] / det;
-        txo[3] = txi[1] / det;
-        txo[4] = txi[0] / det;
-
-        txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
-        txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
-    }
-
-    template <typename T>
-    void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse)
-    {
-        // The way kernel is structured, it expects an inverse
-        // transform matrix by default.
-        // If it is an forward transform, then we need its inverse
-        if(inverse) {
-            for(int i = 0; i < 6; i++)
-                tmat[i] = tmat_ptr[i];
-        } else {
-            calc_affine_inverse(tmat, tmat_ptr);
-        }
-    }
-
-    template<typename T, af_interp_type method>
-    void transform_(T *out, const T *in, const float *tf,
-                    const af::dim4 &odims, const af::dim4 &idims,
-                    const af::dim4 &ostrides, const af::dim4 &istrides,
-                    const af::dim4 &tstrides, const bool inverse)
-    {
-        dim_t nimages     = idims[2];
-        // Multiplied in src/backend/transform.cpp
-        dim_t ntransforms = odims[2] / idims[2];
-
-        void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
-                     const af::dim4 &, const af::dim4 &,
-                     const dim_t, const dim_t, const dim_t, const dim_t);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                t_fn = &transform_n;
-                break;
-            case AF_INTERP_BILINEAR:
-                t_fn = &transform_b;
-                break;
-            case AF_INTERP_LOWER:
-                t_fn = &transform_l;
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
-
-
-        // For each transform channel
-        for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) {
-            // Compute inverse if required
-            const float *tmat_ptr = tf + t_idx * 6;
-            float tmat[6];
-            calc_affine_inverse(tmat, tmat_ptr, inverse);
-
-            // Offset for output pointer
-            dim_t o_offset = t_idx * nimages * ostrides[2];
-
-            // Do transform for image
-            for(int yy = 0; yy < (int)odims[1]; yy++) {
-                for(int xx = 0; xx < (int)odims[0]; xx++) {
-                    t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy);
-                }
-            }
-        }
+template<typename T>  // Host-side entry point: queues the kernel::transform instantiation selected by `method` and returns its output array.
+Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
+                    const af_interp_type method, const bool inverse, const bool perspective)
+{
+    in.eval();         // NOTE(review): presumably forces any pending lazy computation of the inputs before the kernel is queued - confirm
+    transform.eval();
+
+    Array<T> out = createEmptyArray<T>(odims);  // result array with the caller-supplied dimensions; allocated before `method` is checked
+
+    switch(method) {  // `method` is a runtime value, but the kernel takes the interpolation type as a template argument
+        case AF_INTERP_NEAREST :
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_NEAREST >, out, in, transform,
+                    inverse, perspective);
+            break;
+        case AF_INTERP_BILINEAR:
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_BILINEAR>, out, in, transform,
+                    inverse, perspective);
+            break;
+        case AF_INTERP_LOWER   :
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_LOWER   >, out, in, transform,
+                    inverse, perspective);
+            break;
+        default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break;  // every other af_interp_type value is reported as AF_ERR_ARG
     }
 
-    template<typename T>
-    Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
-                        const af_interp_type method, const bool inverse)
-    {
-        const af::dim4 idims = in.dims();
-
-        Array<T> out = createEmptyArray<T>(odims);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                transform_<T, AF_INTERP_NEAREST>
-                          (out.get(), in.get(), transform.get(), odims, idims,
-                           out.strides(), in.strides(), transform.strides(), inverse);
-                break;
-            case AF_INTERP_BILINEAR:
-                transform_<T, AF_INTERP_BILINEAR>
-                          (out.get(), in.get(), transform.get(), odims, idims,
-                           out.strides(), in.strides(), transform.strides(), inverse);
-                break;
-            case AF_INTERP_LOWER:
-                transform_<T, AF_INTERP_LOWER>
-                          (out.get(), in.get(), transform.get(), odims, idims,
-                           out.strides(), in.strides(), transform.strides(), inverse);
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
+    return out;  // NOTE(review): returned right after enqueue; whether the kernel has already run depends on getQueue() - confirm
+}
 
-        return out;
-    }
 
+#define INSTANTIATE(T)                                                              \
+template Array<T> transform(const Array<T> &in, const Array<float> &transform,      \
+                            const af::dim4 &odims, const af_interp_type method,     \
+                            const bool inverse, const bool perspective);
 
-#define INSTANTIATE(T)                                                                  \
-    template Array<T> transform(const Array<T> &in, const Array<float> &transform,      \
-                                const af::dim4 &odims, const af_interp_type method,     \
-                                const bool inverse);
 
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/transform.hpp b/src/backend/cpu/transform.hpp
index f9e730b1d4..ad4ebba5c3 100644
--- a/src/backend/cpu/transform.hpp
+++ b/src/backend/cpu/transform.hpp
@@ -14,5 +14,5 @@ namespace cpu
 {
     template<typename T>
     Array<T> transform(const Array<T> &in, const Array<float> &tf, const af::dim4 &odims,
-                        const af_interp_type method, const bool inverse);
+                        const af_interp_type method, const bool inverse, const bool perspective);
 }
diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp
index 5ad47507b2..d8b9ee2a06 100644
--- a/src/backend/cpu/transform_interp.hpp
+++ b/src/backend/cpu/transform_interp.hpp
@@ -7,6 +7,8 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <math.hpp>
 #include <types.hpp>
 #include <af/traits.hpp>
 
@@ -27,15 +29,27 @@ namespace cpu
     void transform_n(T *out, const T *in, const float *tmat, const af::dim4 &idims,
                       const af::dim4 &ostrides, const af::dim4 &istrides,
                       const dim_t nimages, const dim_t o_offset,
-                      const dim_t xx, const dim_t yy)
+                      const dim_t xx, const dim_t yy, const bool perspective)
     {
+        dim_t yi = 0, xi = 0;
         // Compute output index
-        const dim_t xi = round(xx * tmat[0]
-                             + yy * tmat[1]
-                                  + tmat[2]);
-        const dim_t yi = round(xx * tmat[3]
-                             + yy * tmat[4]
-                                  + tmat[5]);
+        if (perspective) {
+            const float W = xx * tmat[6] + yy * tmat[7] + tmat[8];
+            xi = round((xx * tmat[0]
+                      + yy * tmat[1]
+                           + tmat[2]) / W);
+            yi = round((xx * tmat[3]
+                      + yy * tmat[4]
+                           + tmat[5]) / W);
+        }
+        else {
+            xi = round(xx * tmat[0]
+                     + yy * tmat[1]
+                          + tmat[2]);
+            yi = round(xx * tmat[3]
+                     + yy * tmat[4]
+                          + tmat[5]);
+        }
 
         // Compute memory location of indices
         dim_t loci = (yi * istrides[1] + xi);
@@ -62,16 +76,28 @@ namespace cpu
     void transform_b(T *out, const T *in, const float *tmat, const af::dim4 &idims,
                       const af::dim4 &ostrides, const af::dim4 &istrides,
                       const dim_t nimages, const dim_t o_offset,
-                      const dim_t xx, const dim_t yy)
+                      const dim_t xx, const dim_t yy, const bool perspective)
     {
         dim_t loco = (yy * ostrides[1] + xx);
         // Compute input index
-        const float xi = xx * tmat[0]
-                       + yy * tmat[1]
-                            + tmat[2];
-        const float yi = xx * tmat[3]
-                       + yy * tmat[4]
-                            + tmat[5];
+        float xi = 0.0f, yi = 0.0f;
+        if (perspective) {
+            const float W = xx * tmat[6] + yy * tmat[7] + tmat[8];
+            xi = (xx * tmat[0]
+                + yy * tmat[1]
+                     + tmat[2]) / W;
+            yi = (xx * tmat[3]
+                + yy * tmat[4]
+                     + tmat[5]) / W;
+        }
+        else {
+            xi = xx * tmat[0]
+               + yy * tmat[1]
+                    + tmat[2];
+            yi = xx * tmat[3]
+               + yy * tmat[4]
+                    + tmat[5];
+        }
 
         if (xi < -0.0001 || yi < -0.0001 || idims[0] < xi || idims[1] < yi) {
             for(int i_idx = 0; i_idx < (int)nimages; i_idx++) {
@@ -126,15 +152,27 @@ namespace cpu
     void transform_l(T *out, const T *in, const float *tmat, const af::dim4 &idims,
                       const af::dim4 &ostrides, const af::dim4 &istrides,
                       const dim_t nimages, const dim_t o_offset,
-                      const dim_t xx, const dim_t yy)
+                      const dim_t xx, const dim_t yy, const bool perspective)
     {
         // Compute output index
-        const dim_t xi = floor(xx * tmat[0]
-                             + yy * tmat[1]
-                                  + tmat[2]);
-        const dim_t yi = floor(xx * tmat[3]
-                             + yy * tmat[4]
-                                  + tmat[5]);
+        dim_t xi = 0, yi = 0;
+        if (perspective) {
+            const float W = xx * tmat[6] + yy * tmat[7] + tmat[8];
+            xi = floor((xx * tmat[0]
+                      + yy * tmat[1]
+                           + tmat[2]) / W);
+            yi = floor((xx * tmat[3]
+                      + yy * tmat[4]
+                           + tmat[5]) / W);
+        }
+        else {
+            xi = floor(xx * tmat[0]
+                     + yy * tmat[1]
+                          + tmat[2]);
+            yi = floor(xx * tmat[3]
+                     + yy * tmat[4]
+                          + tmat[5]);
+        }
 
         // Compute memory location of indices
         dim_t loci = (yi * istrides[1] + xi);
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index bea0aa0d6f..a6d410757b 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -12,7 +12,8 @@
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <transpose.hpp>
-
+#include <platform.hpp>
+#include <kernel/transpose.hpp>
 #include <utility>
 #include <cassert>
 
@@ -21,128 +22,26 @@ using af::dim4;
 namespace cpu
 {
 
-static inline unsigned getIdx(const dim4 &strides,
-        int i, int j = 0, int k = 0, int l = 0)
-{
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i );
-}
-
-template<typename T>
-T getConjugate(const T &in)
-{
-    // For non-complex types return same
-    return in;
-}
-
-template<>
-cfloat getConjugate(const cfloat &in)
-{
-    return std::conj(in);
-}
-
-template<>
-cdouble getConjugate(const cdouble &in)
-{
-    return std::conj(in);
-}
-
-template<typename T, bool conjugate>
-void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idims,
-                const af::dim4 &ostrides, const af::dim4 &istrides)
-{
-    for (dim_t l = 0; l < odims[3]; ++l) {
-        for (dim_t k = 0; k < odims[2]; ++k) {
-            // Outermost loop handles batch mode
-            // if input has no data along third dimension
-            // this loop runs only once
-            for (dim_t j = 0; j < odims[1]; ++j) {
-                for (dim_t i = 0; i < odims[0]; ++i) {
-                    // calculate array indices based on offsets and strides
-                    // the helper getIdx takes care of indices
-                    const dim_t inIdx  = getIdx(istrides,j,i,k,l);
-                    const dim_t outIdx = getIdx(ostrides,i,j,k,l);
-                    if(conjugate)
-                        out[outIdx] = getConjugate(in[inIdx]);
-                    else
-                        out[outIdx] = in[inIdx];
-                }
-            }
-            // outData and inData pointers doesn't need to be
-            // offset as the getIdx function is taking care
-            // of the batch parameter
-        }
-    }
-}
-
 template<typename T>
 Array<T> transpose(const Array<T> &in, const bool conjugate)
 {
-    const dim4 inDims = in.dims();
-
-    dim4 outDims   = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);
+    in.eval();  // NOTE(review): presumably forces any pending lazy computation of the input before the kernel is queued - confirm
 
+    const dim4 inDims  = in.dims();
+    const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);  // dims 0 and 1 trade places, dims 2 and 3 are carried over
     // create an array with first two dimensions swapped
     Array<T> out  = createEmptyArray<T>(outDims);
 
-    // get data pointers for input and output Arrays
-    T* outData          = out.get();
-    const T*   inData   = in.get();
-
-    if(conjugate) {
-        transpose_<T, true>(outData, inData,
-                            out.dims(), in.dims(), out.strides(), in.strides());
-    } else {
-        transpose_<T, false>(outData, inData,
-                             out.dims(), in.dims(), out.strides(), in.strides());
-    }
+    getQueue().enqueue(kernel::transpose<T>, out, in, conjugate);  // conjugate is passed as a runtime argument; the removed code picked a template instantiation instead
 
     return out;
 }
 
-template<typename T, bool conjugate>
-void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides)
-{
-    for (dim_t l = 0; l < idims[3]; ++l) {
-        for (dim_t k = 0; k < idims[2]; ++k) {
-            // Outermost loop handles batch mode
-            // if input has no data along third dimension
-            // this loop runs only once
-            //
-            // Run only bottom triangle. std::swap swaps with upper triangle
-            for (dim_t j = 0; j < idims[1]; ++j) {
-                for (dim_t i = j + 1; i < idims[0]; ++i) {
-                    // calculate array indices based on offsets and strides
-                    // the helper getIdx takes care of indices
-                    const dim_t iIdx  = getIdx(istrides,j,i,k,l);
-                    const dim_t oIdx = getIdx(istrides,i,j,k,l);
-                    if(conjugate) {
-                        in[iIdx] = getConjugate(in[iIdx]);
-                        in[oIdx] = getConjugate(in[oIdx]);
-                        std::swap(in[iIdx], in[oIdx]);
-                    }
-                    else {
-                        std::swap(in[iIdx], in[oIdx]);
-                    }
-                }
-            }
-        }
-    }
-}
-
 template<typename T>
 void transpose_inplace(Array<T> &in, const bool conjugate)
 {
-    // get data pointers for input and output Arrays
-    T* inData = in.get();
-
-    if(conjugate) {
-        transpose_inplace<T, true >(inData, in.dims(), in.strides());
-    } else {
-        transpose_inplace<T, false>(inData, in.dims(), in.strides());
-    }
+    in.eval();  // NOTE(review): presumably forces any pending lazy computation of the array before the kernel is queued - confirm
+    getQueue().enqueue(kernel::transpose_inplace<T>, in, conjugate);  // `in` is both source and destination; conjugate is passed as a runtime argument
 }
 
 #define INSTANTIATE(T)                                                      \
@@ -162,5 +61,4 @@ INSTANTIATE(uintl  )
 INSTANTIATE(short)
 INSTANTIATE(ushort)
 
-
 }
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index 6b0f326aad..eaad1b9f86 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -12,6 +12,8 @@
 #include <Array.hpp>
 #include <triangle.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <kernel/triangle.hpp>
 
 namespace cpu
 {
@@ -19,47 +21,14 @@ namespace cpu
 template<typename T, bool is_upper, bool is_unit_diag>
 void triangle(Array<T> &out, const Array<T> &in)
 {
-    T *o = out.get();
-    const T *i = in.get();
-
-    dim4 odm = out.dims();
-
-    dim4 ost = out.strides();
-    dim4 ist = in.strides();
-
-    for(dim_t ow = 0; ow < odm[3]; ow++) {
-        const dim_t oW = ow * ost[3];
-        const dim_t iW = ow * ist[3];
-
-        for(dim_t oz = 0; oz < odm[2]; oz++) {
-            const dim_t oZW = oW + oz * ost[2];
-            const dim_t iZW = iW + oz * ist[2];
-
-            for(dim_t oy = 0; oy < odm[1]; oy++) {
-                const dim_t oYZW = oZW + oy * ost[1];
-                const dim_t iYZW = iZW + oy * ist[1];
-
-                for(dim_t ox = 0; ox < odm[0]; ox++) {
-                    const dim_t oMem = oYZW + ox;
-                    const dim_t iMem = iYZW + ox;
-
-                    bool cond = is_upper ? (oy >= ox) : (oy <= ox);
-                    bool do_unit_diag = (is_unit_diag && ox == oy);
-                    if(cond) {
-                        o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
-                    } else {
-                        o[oMem] = scalar<T>(0);
-                    }
-
-                }
-            }
-        }
-    }
+    in.eval();  // NOTE(review): presumably forces any pending lazy computation of the input before the kernel is queued - confirm
+    getQueue().enqueue(kernel::triangle<T, is_upper, is_unit_diag>, out, in);  // this overload does not allocate `out`; is_upper / is_unit_diag are forwarded as template arguments
 }
 
 template<typename T, bool is_upper, bool is_unit_diag>
 Array<T> triangle(const Array<T> &in)
 {
+    in.eval();
     Array<T> out = createEmptyArray<T>(in.dims());
     triangle<T, is_upper, is_unit_diag>(out, in);
     return out;
@@ -75,17 +44,17 @@ Array<T> triangle(const Array<T> &in)
     template Array<T> triangle<T, true , false>(const Array<T> &in);    \
     template Array<T> triangle<T, false, false>(const Array<T> &in);    \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index f9c25f9a9e..d19286f496 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -9,116 +9,58 @@
 
 #include <Array.hpp>
 #include <unwrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <dispatch.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <kernel/unwrap.hpp>
 
 namespace cpu
 {
-    template<typename T, int d>
-    void unwrap_dim(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                    const af::dim4 &ostrides, const af::dim4 &istrides,
-                    const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                    const dim_t px, const dim_t py)
-    {
-        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
 
-        for(dim_t w = 0; w < odims[3]; w++) {
-            for(dim_t z = 0; z < odims[2]; z++) {
-
-                dim_t cOut = w * ostrides[3] + z * ostrides[2];
-                dim_t cIn  = w * istrides[3] + z * istrides[2];
-                const T* iptr = inPtr  + cIn;
-                T* optr_= outPtr + cOut;
-
-                for(dim_t col = 0; col < odims[d]; col++) {
-                    // Offset output ptr
-                    T* optr = optr_ + col * ostrides[d];
-
-                    // Calculate input window index
-                    dim_t winy = (col / nx);
-                    dim_t winx = (col % nx);
-
-                    dim_t startx = winx * sx;
-                    dim_t starty = winy * sy;
-
-                    dim_t spx = startx - px;
-                    dim_t spy = starty - py;
-
-                    // Short cut condition ensuring all values within input dimensions
-                    bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]);
+template<typename T>  // Host-side entry point: computes the output dimensions, allocates the output and queues kernel::unwrap_dim on it.
+Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
+                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
+{
+    in.eval();  // NOTE(review): presumably forces any pending lazy computation of the input before the kernel is queued - confirm
 
-                    for(dim_t y = 0; y < wy; y++) {
-                        for(dim_t x = 0; x < wx; x++) {
-                            dim_t xpad = spx + x;
-                            dim_t ypad = spy + y;
+    af::dim4 idims = in.dims();
+    dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;  // integer division; NOTE(review): reads as the number of window positions along dim 0 (px padding, sx stride) - confirm
+    dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;  // same expression for dim 1
 
-                            dim_t oloc = (y * wx + x);
-                            if (d == 0) oloc *= ostrides[1];
+    af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);  // dim 0 holds wx*wy values, dim 1 has nx*ny entries; dims 2 and 3 follow the input
 
-                            if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
-                                dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
-                                optr[oloc] = iptr[iloc];
-                            } else {
-                                optr[oloc] = scalar<T>(0.0);
-                            }
-                        }
-                    }
-                }
-            }
-        }
+    if (!is_column) {  // otherwise the first two output dims are exchanged
+        std::swap(odims[0], odims[1]);
     }
 
-    template<typename T>
-    Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
-    {
-        af::dim4 idims = in.dims();
-
-        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
-        dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;
+    Array<T> outArray = createEmptyArray<T>(odims);
 
-        af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
-
-        if (!is_column) {
-            std::swap(odims[0], odims[1]);
-        }
-
-        // Create output placeholder
-        Array<T> outArray = createEmptyArray<T>(odims);
-
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-        T *outPtr = outArray.get();
-
-        af::dim4 ostrides = outArray.strides();
-        af::dim4 istrides = in.strides();
-
-        if (is_column) {
-            unwrap_dim<T, 1>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-        } else {
-            unwrap_dim<T, 0>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-        }
-        return outArray;
+    if (is_column) {  // is_column selects the unwrap_dim<T, 1> instantiation, otherwise unwrap_dim<T, 0>
+        getQueue().enqueue(kernel::unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
+    } else {
+        getQueue().enqueue(kernel::unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
     }
 
+    return outArray;
+}
+
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
                     const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/utility.hpp b/src/backend/cpu/utility.hpp
new file mode 100644
index 0000000000..68cef5a440
--- /dev/null
+++ b/src/backend/cpu/utility.hpp
@@ -0,0 +1,102 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <af/constants.h>
+#include <cmath>
+#include <algorithm>
+#include "backend.hpp"
+
+namespace cpu
+{
+
+/**
+ * Folds an out-of-range index back into [0, len) by reflecting it about the
+ * nearest edge of the array. The edge element is repeated: -1 maps to 0 and
+ * len maps to len-1. Indices already in range are returned unchanged.
+ *
+ * Indices more than one length outside the array are folded modulo len; the
+ * result is always inside [0, len).
+ *
+ * \param idx index to fold, may be negative
+ * \param len extent of the dimension, must be positive
+ * \return the folded index
+ */
+static inline
+dim_t trimIndex(int const & idx, dim_t const & len)
+{
+    if (idx < 0) {
+        // -1 -> 0, -2 -> 1, ..., -len -> len-1. The 1 is subtracted before
+        // the modulo, not after it, so that an exact negative multiple of
+        // len cannot produce -1.
+        return (-static_cast<dim_t>(idx) - 1) % len;
+    }
+    dim_t const pos = idx; // widened so that len is never narrowed to int
+    if (pos >= len) {
+        // len -> len-1, len+1 -> len-2, ...
+        return len - (pos % len) - 1;
+    }
+    return pos;
+}
+
+/**
+ * Restricts a to the closed interval [mn, mx]. The lower bound is tested
+ * first, so mn is returned whenever a < mn.
+ */
+static inline
+dim_t clamp(dim_t a, dim_t mn, dim_t mx)
+{
+    return (a<mn ? mn : (a>mx ? mx : a));
+}
+
+/**
+ * Linear offset of element (i, j, k, l): each coordinate multiplied by the
+ * stride of its dimension, summed.
+ *
+ * NOTE(review): the sum is converted to unsigned on return, so an offset that
+ * does not fit in an unsigned wraps around - confirm callers stay below that.
+ */
+static inline
+unsigned getIdx(af::dim4 const & strides, int i, int j = 0, int k = 0, int l = 0)
+{
+    return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]);
+}
+
+/**
+ * Writes dim samples of a Gaussian into out, scaled so that they sum to 1.
+ *
+ * Sample i is evaluated at x = i - (dim-1)/2 using integer division, so the
+ * peak sits on the middle element for odd dim and on the lower of the two
+ * middle elements for even dim.
+ *
+ * \param out   destination buffer, must hold at least dim values
+ * \param dim   number of samples to write
+ * \param sigma standard deviation; when it is not greater than zero (the
+ *              default 0.0 included) 0.25*dim is used instead
+ */
+template<typename T>
+void gaussian1D(T* out, int const dim, double sigma=0.0)
+{
+    if(!(sigma>0)) sigma = 0.25*dim;
+
+    T sum = (T)0;
+    for(int i=0;i<dim;i++)
+    {
+        int x = i-(dim-1)/2;
+        T el = 1. / std::sqrt(2 * af::Pi * sigma*sigma) * std::exp(-((x*x)/(2*(sigma*sigma))));
+        out[i] = el;
+        sum   += el;
+    }
+
+    for(int k=0;k<dim;k++)
+        out[k] /= sum;
+}
+
+}
diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp
index 6c0f8c7acc..249327163d 100644
--- a/src/backend/cpu/where.cpp
+++ b/src/backend/cpu/where.cpp
@@ -16,63 +16,68 @@
 #include <where.hpp>
 #include <ops.hpp>
 #include <vector>
+#include <platform.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    Array<uint> where(const Array<T> &in)
-    {
-        const dim_t *dims    = in.dims().get();
-        const dim_t *strides = in.strides().get();
-        static const T zero = scalar<T>(0);
 
-        const T *iptr = in.get();
-        uint *out_vec  = memAlloc<uint>(in.elements());
+template<typename T>  // Returns the running element numbers of all elements of `in` that differ from zero; the scan runs inline on the calling thread.
+Array<uint> where(const Array<T> &in)
+{
+    in.eval();  // NOTE(review): presumably forces any pending lazy computation of the input - confirm
+    getQueue().sync();  // NOTE(review): presumably blocks until queued work has finished, since in.get() is read directly below - confirm
+
+    const dim_t *dims    = in.dims().get();  // NOTE(review): pointer into whatever dims() returns; only safe if that is a reference - confirm
+    const dim_t *strides = in.strides().get();
+    static const T zero = scalar<T>(0);
+
+    const T *iptr = in.get();
+    uint *out_vec  = memAlloc<uint>(in.elements());  // sized for the case where every element is non-zero
 
-        dim_t count = 0;
-        dim_t idx = 0;
-        for (dim_t w = 0; w < dims[3]; w++) {
-            uint offw = w * strides[3];
+    dim_t count = 0;  // number of indices stored so far
+    dim_t idx = 0;  // running element number in traversal order (x fastest, w slowest)
+    for (dim_t w = 0; w < dims[3]; w++) {
+        uint offw = w * strides[3];  // memory offsets are accumulated in uint
 
-            for (dim_t z = 0; z < dims[2]; z++) {
-                uint offz = offw + z * strides[2];
+        for (dim_t z = 0; z < dims[2]; z++) {
+            uint offz = offw + z * strides[2];
 
-                for (dim_t y = 0; y < dims[1]; y++) {
-                    uint offy = y * strides[1] + offz;
+            for (dim_t y = 0; y < dims[1]; y++) {
+                uint offy = y * strides[1] + offz;
 
-                    for (dim_t x = 0; x < dims[0]; x++) {
+                for (dim_t x = 0; x < dims[0]; x++) {
 
-                        T val = iptr[offy + x];
-                        if (val != zero) {
-                            out_vec[count] = idx;
-                            count++;
-                        }
-                        idx++;
+                    T val = iptr[offy + x];  // x is added without a stride factor
+                    if (val != zero) {
+                        out_vec[count] = idx;  // stores the element number, not the memory offset
+                        count++;
                     }
+                    idx++;
                 }
             }
         }
-
-        Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);
-        return out;
     }
 
+    Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);  // the result reports count elements; out_vec is handed to createDeviceDataArray
+    return out;
+}
+
 #define INSTANTIATE(T)                                  \
     template Array<uint> where<T>(const Array<T> &in);    \
 
-    INSTANTIATE(float  )
-    INSTANTIATE(cfloat )
-    INSTANTIATE(double )
-    INSTANTIATE(cdouble)
-    INSTANTIATE(char   )
-    INSTANTIATE(int    )
-    INSTANTIATE(uint   )
-    INSTANTIATE(intl   )
-    INSTANTIATE(uintl  )
-    INSTANTIATE(uchar  )
-    INSTANTIATE(short  )
-    INSTANTIATE(ushort )
+INSTANTIATE(float  )
+INSTANTIATE(cfloat )
+INSTANTIATE(double )
+INSTANTIATE(cdouble)
+INSTANTIATE(char   )
+INSTANTIATE(int    )
+INSTANTIATE(uint   )
+INSTANTIATE(intl   )
+INSTANTIATE(uintl  )
+INSTANTIATE(uchar  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp
index a04a6f5250..8e0f6fe2f7 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/wrap.cpp
@@ -9,95 +9,37 @@
 
 #include <Array.hpp>
 #include <wrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <dispatch.hpp>
 #include <math.hpp>
+#include <platform.hpp>
+#include <kernel/wrap.hpp>
 
 namespace cpu
 {
 
-    template<typename T, int d>
-    void wrap_dim(T *outPtr, const T *inPtr,
-                  const af::dim4 &odims, const af::dim4 &idims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const dim_t wx, const dim_t wy,
-                  const dim_t sx, const dim_t sy,
-                  const dim_t px, const dim_t py)
-    {
-        dim_t nx = (odims[0] + 2 * px - wx) / sx + 1;
-
-        for(dim_t w = 0; w < idims[3]; w++) {
-            for(dim_t z = 0; z < idims[2]; z++) {
-
-                dim_t cIn  = w * istrides[3] + z * istrides[2];
-                dim_t cOut = w * ostrides[3] + z * ostrides[2];
-                const T* iptr_ = inPtr  + cIn;
-                T* optr= outPtr + cOut;
-
-                for(dim_t col = 0; col < idims[d]; col++) {
-                    // Offset output ptr
-                    const T* iptr = iptr_ + col * istrides[d];
-
-                    // Calculate input window index
-                    dim_t winy = (col / nx);
-                    dim_t winx = (col % nx);
-
-                    dim_t startx = winx * sx;
-                    dim_t starty = winy * sy;
-
-                    dim_t spx = startx - px;
-                    dim_t spy = starty - py;
-
-                    // Short cut condition ensuring all values within input dimensions
-                    bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]);
-
-                    for(dim_t y = 0; y < wy; y++) {
-                        for(dim_t x = 0; x < wx; x++) {
-                            dim_t xpad = spx + x;
-                            dim_t ypad = spy + y;
+template<typename T>  // Host-side entry point: allocates a zero-filled output of size (ox, oy, ...) and queues kernel::wrap_dim on it.
+Array<T> wrap(const Array<T> &in,
+              const dim_t ox, const dim_t oy,
+              const dim_t wx, const dim_t wy,
+              const dim_t sx, const dim_t sy,
+              const dim_t px, const dim_t py,
+              const bool is_column)
+{
+    af::dim4 idims = in.dims();
+    af::dim4 odims(ox, oy, idims[2], idims[3]);  // dims 0 and 1 come from the caller, dims 2 and 3 from the input
 
-                            dim_t iloc = (y * wx + x);
-                            if (d == 0) iloc *= istrides[1];
+    Array<T> out = createValueArray<T>(odims, scalar<T>(0));  // starts at zero; NOTE(review): the removed wrap_dim accumulated into the output with +=, presumably the kernel still does - confirm
+    out.eval();  // NOTE(review): out is evaluated as well as in, presumably so the zero fill exists before the kernel is queued - confirm
+    in.eval();
 
-                            if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) {
-                                dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]);
-                                // FIXME: When using threads, atomize this
-                                optr[oloc] += iptr[iloc];
-                            }
-                        }
-                    }
-                }
-            }
-        }
+    if (is_column) {  // is_column selects the wrap_dim<T, 1> instantiation, otherwise wrap_dim<T, 0>
+        getQueue().enqueue(kernel::wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
+    } else {
+        getQueue().enqueue(kernel::wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
     }
 
-    template<typename T>
-    Array<T> wrap(const Array<T> &in,
-                  const dim_t ox, const dim_t oy,
-                  const dim_t wx, const dim_t wy,
-                  const dim_t sx, const dim_t sy,
-                  const dim_t px, const dim_t py,
-                  const bool is_column)
-    {
-        af::dim4 idims = in.dims();
-        af::dim4 odims(ox, oy, idims[2], idims[3]);
-        Array<T> out = createValueArray<T>(odims, scalar<T>(0));
-
-        const T *inPtr = in.get();
-        T *outPtr = out.get();
-
-        af::dim4 istrides = in.strides();
-        af::dim4 ostrides = out.strides();
-
-        if (is_column) {
-            wrap_dim<T, true >(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-        } else {
-            wrap_dim<T, false>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-        }
-
-        return out;
-    }
+    return out;
+}
 
 
 #define INSTANTIATE(T)                                          \
@@ -108,17 +50,17 @@ namespace cpu
                                const dim_t px, const dim_t py,  \
                                const bool is_column);
 
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index 275ea13a99..786574129b 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -16,6 +16,7 @@
 #include <memory.hpp>
 #include <platform.hpp>
 #include <cstddef>
+#include <MemoryManager.hpp>
 
 using af::dim4;
 
@@ -29,17 +30,17 @@ namespace cuda
 
     template<typename T>
     Array<T>::Array(af::dim4 dims) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {}
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, const T * const in_data, bool is_device, bool copy_device) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(((is_device & !copy_device) ? (T *)in_data : memAlloc<T>(dims.elements())), memFree<T>),
         data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {
 #if __cplusplus > 199711L
         static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
@@ -57,34 +58,51 @@ namespace cuda
     }
 
     template<typename T>
-    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
-        info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
+    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) :
+        info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type),
         data(parent.getData()), data_dims(parent.getDataDims()),
         node(),
-        offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
         ready(true), owner(false)
     { }
 
     template<typename T>
     Array<T>::Array(Param<T> &tmp) :
-        info(getActiveDeviceId(), af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]),
-                  af::dim4(0, 0, 0, 0),
-                  af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]),
-                  (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(),
+             af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]),
+             0,
+             af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]),
+             (af_dtype)dtype_traits<T>::af_type),
         data(tmp.ptr, memFree<T>),
         data_dims(af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3])),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {
     }
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, JIT::Node_ptr n) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(), data_dims(dims),
-        node(n), offset(0), ready(false), owner(true)
+        node(n), ready(false), owner(true)
     {
     }
 
+    template<typename T>
+    Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset_,
+                    const T * const in_data, bool is_device) :
+        info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type),
+        data(is_device ? (T*)in_data : memAlloc<T>(info.total()), memFree<T>),
+        data_dims(dims),
+        node(),
+        ready(true),
+        owner(true)
+    {
+        if (!is_device) {
+            cudaStream_t stream = getStream(getActiveDeviceId());
+            CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, info.total() * sizeof(T),
+                                       cudaMemcpyHostToDevice, stream));
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+        }
+    }
 
     template<typename T>
     void Array<T>::eval()
@@ -148,9 +166,9 @@ namespace cuda
         n->getInfo(length, buf_count, bytes);
         n->resetFlags();
 
-        if (length > MAX_JIT_LEN ||
-            buf_count >= MAX_BUFFERS ||
-            bytes >= MAX_BYTES) {
+        if (length > getMaxJitSize() ||
+            buf_count >= getMaxBuffers() ||
+            bytes >= getMaxBytes()) {
             out.eval();
         }
 
@@ -197,18 +215,23 @@ namespace cuda
         dim4 dDims = parent.getDataDims();
         dim4 pDims = parent.dims();
 
-        dim4 dims   = toDims  (index, pDims);
-        dim4 offset = toOffset(index, dDims);
-        dim4 stride = toStride (index, dDims);
+        dim4 dims    = toDims  (index, pDims);
+        dim4 strides = toStride (index, dDims);
+
+        // Find total offsets after indexing
+        dim4 offsets = toOffset(index, pDims);
+        dim4 parent_strides = parent.strides();
+        dim_t offset = parent.getOffset();
+        for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i];
 
-        Array<T> out = Array<T>(parent, dims, offset, stride);
+        Array<T> out = Array<T>(parent, dims, offset, strides);
 
         if (!copy) return out;
 
-        if (stride[0] != 1 ||
-            stride[1] <  0 ||
-            stride[2] <  0 ||
-            stride[3] <  0) {
+        if (strides[0] != 1 ||
+            strides[1] <  0 ||
+            strides[2] <  0 ||
+            strides[3] <  0) {
 
             out = copyArray(out);
         }
@@ -228,23 +251,17 @@ namespace cuda
         delete A;
     }
 
-    template<typename T>
-    void evalArray(const Array<T> &A)
-    {
-        A.eval();
-    }
-
     template<typename T>
     void
     writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
     {
         if (!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
+            arr = copyArray<T>(arr);
         }
 
         T *ptr = arr.get();
 
-        CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes, cudaMemcpyHostToDevice,
+        CUDA_CHECK(cudaMemcpyAsync(ptr, data, bytes, cudaMemcpyHostToDevice,
                     cuda::getStream(cuda::getActiveDeviceId())));
         CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
@@ -256,12 +273,12 @@ namespace cuda
     writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
     {
         if (!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
+            arr = copyArray<T>(arr);
         }
 
         T *ptr = arr.get();
 
-        CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data,
+        CUDA_CHECK(cudaMemcpyAsync(ptr, data,
                                    bytes, cudaMemcpyDeviceToDevice,
                                    cuda::getStream(cuda::getActiveDeviceId())));
 
@@ -279,11 +296,14 @@ namespace cuda
                                                        const std::vector<af_seq> &index, \
                                                        bool copy);      \
     template       void      destroyArray<T>          (Array<T> *A);    \
-    template       void      evalArray<T>             (const Array<T> &A); \
     template       Array<T>  createNodeArray<T>       (const dim4 &size, JIT::Node_ptr node); \
+    template       Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \
+                                   const T * const in_data,             \
+                                   bool is_device);                     \
     template       Array<T>::Array(af::dim4 dims, const T * const in_data, \
                                    bool is_device, bool copy_device);   \
     template       Array<T>::~Array        ();                          \
+    template       Node_ptr Array<T>::getNode() const;             \
     template       void Array<T>::eval();                               \
     template       void Array<T>::eval() const;                         \
     template       void      writeHostDataArray<T>    (Array<T> &arr, const T * const data, const size_t bytes); \
diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp
index 598fdfd35e..7678754bc3 100644
--- a/src/backend/cuda/Array.hpp
+++ b/src/backend/cuda/Array.hpp
@@ -78,9 +78,6 @@ namespace cuda
                             const std::vector<af_seq> &index,
                             bool copy=true);
 
-    template<typename T>
-    void evalArray(const Array<T> &A);
-
     // Creates a new Array object on the heap and returns a reference to it.
     template<typename T>
     void destroyArray(Array<T> *A);
@@ -89,10 +86,16 @@ namespace cuda
     void *getDevicePtr(const Array<T>& arr)
     {
         T *ptr = arr.device();
-        memPop(ptr);
+        memLock(ptr);
         return (void *)ptr;
     }
 
+    template<typename T>
+    void *getRawPtr(const Array<T>& arr)
+    {
+        return (void *)(arr.get(false));
+    }
+
     template<typename T>
     class Array
     {
@@ -101,17 +104,20 @@ namespace cuda
         af::dim4 data_dims;
 
         JIT::Node_ptr node;
-        dim_t offset;
         bool ready;
         bool owner;
 
         Array(af::dim4 dims);
+
         explicit Array(af::dim4 dims, const T * const in_data, bool is_device = false, bool copy_device = false);
-        Array(const Array<T>& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride);
+        Array(const Array<T>& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride);
         Array(Param<T> &tmp);
         Array(af::dim4 dims, JIT::Node_ptr n);
     public:
 
+        Array(af::dim4 dims, af::dim4 strides, dim_t offset,
+              const T * const in_data, bool is_device = false);
+
         void resetInfo(const af::dim4& dims)        { info.resetInfo(dims);         }
         void resetDims(const af::dim4& dims)        { info.resetDims(dims);         }
         void modDims(const af::dim4 &newDims)       { info.modDims(newDims);        }
@@ -122,7 +128,6 @@ namespace cuda
     RET_TYPE NAME() const { return info.NAME(); }
 
         INFO_FUNC(const af_dtype& ,getType)
-        INFO_FUNC(const af::dim4& ,offsets)
         INFO_FUNC(const af::dim4& ,strides)
         INFO_FUNC(size_t          ,elements)
         INFO_FUNC(size_t          ,ndims)
@@ -159,7 +164,7 @@ namespace cuda
         void eval();
         void eval() const;
 
-        dim_t getOffset() const { return offset; }
+        dim_t getOffset() const { return info.getOffset(); }
         shared_ptr<T> getData() const { return data; }
 
         dim4 getDataDims() const
@@ -169,6 +174,11 @@ namespace cuda
             return isOwner() ? dims() : data_dims;
         }
 
+        void setDataDims(const dim4 &new_dims)
+        {
+            data_dims = new_dims;
+        }
+
         T* device()
         {
             if (!isOwner() || data.use_count() > 1) {
@@ -192,7 +202,7 @@ namespace cuda
         const   T* get(bool withOffset = true) const
         {
             if (!isReady()) eval();
-            return data.get() + (withOffset ? offset : 0);
+            return data.get() + (withOffset ? getOffset() : 0);
         }
 
         int useCount() const
@@ -234,8 +244,8 @@ namespace cuda
                                           bool copy);
 
         friend void destroyArray<T>(Array<T> *arr);
-        friend void evalArray<T>(const Array<T> &arr);
         friend void *getDevicePtr<T>(const Array<T>& arr);
+        friend void *getRawPtr<T>(const Array<T>& arr);
     };
 
 }
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index bb8fca013c..ae0690dba2 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -18,6 +18,7 @@ IF(    CUDA_COMPUTE_20
     OR CUDA_COMPUTE_30
     OR CUDA_COMPUTE_32
     OR CUDA_COMPUTE_35
+    OR CUDA_COMPUTE_37
     OR CUDA_COMPUTE_50
     OR CUDA_COMPUTE_52
     OR CUDA_COMPUTE_53
@@ -44,11 +45,12 @@ MACRO(SET_COMPUTE VERSION)
     SET(CUDA_GENERATE_CODE_${VERSION} "-gencode arch=compute_${VERSION},code=sm_${VERSION}")
     SET(CUDA_GENERATE_CODE ${CUDA_GENERATE_CODE} ${CUDA_GENERATE_CODE_${VERSION}})
     LIST(APPEND COMPUTE_VERSIONS "${VERSION}")
+    ADD_DEFINITIONS(-DCUDA_COMPUTE_${VERSION})
     MESSAGE(STATUS "Setting Compute ${VERSION} to ON")
 ENDMACRO(SET_COMPUTE)
 
 # Iterate over compute versions. Create variables and enable computes if needed
-FOREACH(VER 20 30 32 35 50 52 53)
+FOREACH(VER 20 30 32 35 37 50 52 53)
     OPTION(CUDA_COMPUTE_${VER} "CUDA Compute Capability ${VER}" OFF)
     MARK_AS_ADVANCED(CUDA_COMPUTE_${VER})
     IF(${CUDA_COMPUTE_${VER}})
@@ -57,8 +59,9 @@ FOREACH(VER 20 30 32 35 50 52 53)
 ENDFOREACH()
 
 IF(UNIX)
+    # Forcing STRICT ANSI should resolve a bunch of issues that NVIDIA seems to face with GCC compilers.
+    ADD_DEFINITIONS(-D__STRICT_ANSI__)
     SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fvisibility=hidden)
-    REMOVE_DEFINITIONS(-std=c++0x)
     IF(${WITH_COVERAGE})
         SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fprofile-arcs -Xcompiler -ftest-coverage -Xlinker -fprofile-arcs -Xlinker -ftest-coverage")
     ENDIF(${WITH_COVERAGE})
@@ -68,50 +71,70 @@ ENDIF()
 
 ADD_DEFINITIONS(-DAF_CUDA)
 
-IF(${CUDA_VERSION_MAJOR} LESS 7)
+# CMake 3.2 added the CUDA_cusolver_LIBRARY variable to FindCUDA.
+# With older CMake versions, locate cusolver manually using FIND_LIBRARY.
+IF(CMAKE_VERSION VERSION_LESS 3.2)
+    IF("${CUDA_cusolver_LIBRARY}" MATCHES " ")  # Quoted: an unset/empty value must not break IF()
+        UNSET(CUDA_cusolver_LIBRARY CACHE)  # When going from higher version to lower version
+    ENDIF()
+    FIND_LIBRARY (
+        CUDA_cusolver_LIBRARY
+        NAMES "cusolver"
+        PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+        PATH_SUFFIXES "lib64" "lib/x64" "lib"
+        DOC "CUDA cusolver Library"
+        NO_DEFAULT_PATH
+        )
+ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
+
+IF(${CUDA_VERSION_MAJOR} LESS 7 AND CUDA_cusolver_LIBRARY)
+  UNSET(CUDA_cusolver_LIBRARY CACHE)  # Failsafe when going from higher version to lower version
+ENDIF()
+
+IF(CUDA_cusolver_LIBRARY)
+    MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}")
+    ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA)
+ELSE(CUDA_cusolver_LIBRARY)
     # Use CPU Lapack as fallback?
-    OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when CUDA is 6.5 or older" OFF)
+    OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when cusolver is not available" OFF)
     MARK_AS_ADVANCED(CUDA_LAPACK_CPU_FALLBACK)
 
     IF(${CUDA_LAPACK_CPU_FALLBACK})
         ## Try to use CPU side lapack
         IF(APPLE)
-            FIND_PACKAGE(LAPACK)
+            FIND_PACKAGE(LAPACKE QUIET) # For finding MKL
+            IF(NOT LAPACK_FOUND)
+                # UNSET THE VARIABLES FROM LAPACKE
+                UNSET(LAPACKE_LIB CACHE)
+                UNSET(LAPACK_LIB CACHE)
+                UNSET(LAPACKE_INCLUDES CACHE)
+                UNSET(LAPACKE_ROOT_DIR CACHE)
+                FIND_PACKAGE(LAPACK)
+            ENDIF()
         ELSE(APPLE) # Linux and Windows
             FIND_PACKAGE(LAPACKE)
         ENDIF(APPLE)
 
         IF(NOT LAPACK_FOUND)
-            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.")
         ELSE(NOT LAPACK_FOUND)
-            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.")
+            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. But CPU LAPACK libraries are available. Will fallback to using host side code.")
             ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA)
-            IF(USE_CUDA_MKL)
-                MESSAGE("Using MKL")
+            IF(USE_CUDA_MKL) # Manual MKL Setup
+                MESSAGE("CUDA LAPACK CPU Fallback Using MKL")
                 ADD_DEFINITIONS(-DUSE_MKL)
+            ELSE(USE_CUDA_MKL)
+                IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS
+                    MESSAGE("CUDA LAPACK CPU Fallback Using MKL RT")
+                    ADD_DEFINITIONS(-DUSE_MKL)
+                ENDIF()
             ENDIF()
         ENDIF()
     ELSE()
-        MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+        MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.")
     ENDIF()
-    IF(CMAKE_VERSION VERSION_LESS 3.2)
-        SET(CUDA_cusolver_LIBRARY)
-        MARK_AS_ADVANCED(CUDA_cusolver_LIBRARY)
-    ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
-ELSE(${CUDA_VERSION_MAJOR} LESS 7)
-    MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}")
-    ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA)
-    IF(CMAKE_VERSION VERSION_LESS 3.2)
-        FIND_LIBRARY(
-            CUDA_cusolver_LIBRARY
-            NAMES "cusolver"
-            PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-            PATH_SUFFIXES "lib64" "lib/x64" "lib"
-            DOC "CUDA cusolver Library"
-            NO_DEFAULT_PATH
-            )
-    ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
-ENDIF(${CUDA_VERSION_MAJOR} LESS 7)
+    UNSET(CUDA_cusolver_LIBRARY CACHE)  # Failsafe when going from higher version to lower version
+ENDIF(CUDA_cusolver_LIBRARY)
 
 INCLUDE_DIRECTORIES(
     ${CMAKE_INCLUDE_PATH}
@@ -308,7 +331,6 @@ ADD_DEPENDENCIES(afcuda ${ptx_targets})
 
 TARGET_LINK_LIBRARIES(afcuda    PRIVATE ${CUDA_CUBLAS_LIBRARIES}
                                 PRIVATE ${CUDA_LIBRARIES}
-                                PRIVATE ${CUDA_cusolver_LIBRARY}
                                 PRIVATE ${FreeImage_LIBS}
                                 PRIVATE ${CUDA_CUFFT_LIBRARIES}
                                 PRIVATE ${CUDA_NVVM_LIBRARIES}
@@ -318,8 +340,10 @@ IF(FORGE_FOUND)
     TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES})
 ENDIF()
 
-IF(CUDA_LAPACK_CPU_FALLBACK)
-  TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES})
+IF(CUDA_cusolver_LIBRARY)
+    TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_cusolver_LIBRARY})
+ELSEIF(CUDA_LAPACK_CPU_FALLBACK)
+    TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES})
 ENDIF()
 
 SET_TARGET_PROPERTIES(afcuda PROPERTIES
diff --git a/src/backend/cuda/JIT/BinaryNode.hpp b/src/backend/cuda/JIT/BinaryNode.hpp
index 2a2abb0610..f916d85576 100644
--- a/src/backend/cuda/JIT/BinaryNode.hpp
+++ b/src/backend/cuda/JIT/BinaryNode.hpp
@@ -126,11 +126,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_set_arg = false;
+            resetCommonFlags();
             m_lhs->resetFlags();
             m_rhs->resetFlags();
         }
diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp
index efe32f8b72..342e1ed0b7 100644
--- a/src/backend/cuda/JIT/BufferNode.hpp
+++ b/src/backend/cuda/JIT/BufferNode.hpp
@@ -178,12 +178,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_gen_name = false;
-            m_set_arg = false;
+            resetCommonFlags();
         }
 
         void setArgs(std::vector<void *> &args, bool is_linear)
diff --git a/src/backend/cuda/JIT/Node.hpp b/src/backend/cuda/JIT/Node.hpp
index e30a1cf63b..00fed9fda7 100644
--- a/src/backend/cuda/JIT/Node.hpp
+++ b/src/backend/cuda/JIT/Node.hpp
@@ -37,6 +37,19 @@ namespace JIT
         bool m_set_arg;
         bool m_gen_name;
 
+    protected:
+
+        void resetCommonFlags()
+        {
+            m_set_id = false;
+            m_gen_func = false;
+            m_gen_param = false;
+            m_gen_offset = false;
+            m_set_arg = false;
+            m_gen_name = false;
+        }
+
+
     public:
 
         Node(const char *type_str, const char *name_str)
@@ -62,7 +75,11 @@ namespace JIT
         virtual void setArgs(std::vector<void *> &args, bool is_linear) { m_set_arg = true; }
         virtual bool isLinear(dim_t dims[4]) { return true; }
 
-        virtual void resetFlags() {}
+        virtual void resetFlags()
+        {
+            resetCommonFlags();
+        }
+
         virtual void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes)
         {
             len = 0;
diff --git a/src/backend/cuda/JIT/ScalarNode.hpp b/src/backend/cuda/JIT/ScalarNode.hpp
index 288af4dcdb..34f316d34b 100644
--- a/src/backend/cuda/JIT/ScalarNode.hpp
+++ b/src/backend/cuda/JIT/ScalarNode.hpp
@@ -87,12 +87,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_gen_name = false;
-            m_set_arg = false;
+            resetCommonFlags();
         }
 
         void setArgs(std::vector<void *> &args, bool is_linear)
diff --git a/src/backend/cuda/JIT/UnaryNode.hpp b/src/backend/cuda/JIT/UnaryNode.hpp
index caa573104b..94ee96ece7 100644
--- a/src/backend/cuda/JIT/UnaryNode.hpp
+++ b/src/backend/cuda/JIT/UnaryNode.hpp
@@ -118,11 +118,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_set_arg = false;
+            resetCommonFlags();
             m_child->resetFlags();
         }
 
diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu
index 8253db6d22..2bcb15a112 100644
--- a/src/backend/cuda/JIT/numeric.cu
+++ b/src/backend/cuda/JIT/numeric.cu
@@ -119,6 +119,19 @@ MATH_CAST(lgamma, intl  , float)
 MATH_CAST(lgamma, ushort, float)
 MATH_CAST(lgamma, short , float)
 
+MATH_NOOP(noop, float)
+MATH_NOOP(noop, double)
+MATH_NOOP(noop, cfloat)
+MATH_NOOP(noop, cdouble)
+MATH_NOOP(noop, int)
+MATH_NOOP(noop, uint)
+MATH_NOOP(noop, char)
+MATH_NOOP(noop, uchar)
+MATH_NOOP(noop, uintl)
+MATH_NOOP(noop, intl)
+MATH_NOOP(noop, ushort)
+MATH_NOOP(noop, short)
+
 __device__ float ___abs(cfloat a) { return cuCabsf(a); }
 __device__ double ___abs(cdouble a) { return cuCabs(a); }
 
diff --git a/src/backend/cuda/blas.cpp b/src/backend/cuda/blas.cpp
index 85f48da750..9d3b9ca7b7 100644
--- a/src/backend/cuda/blas.cpp
+++ b/src/backend/cuda/blas.cpp
@@ -18,6 +18,9 @@
 #include <math.hpp>
 #include <err_common.hpp>
 #include <cublasManager.hpp>
+#include <arith.hpp>
+#include <reduce.hpp>
+#include <complex.hpp>
 
 namespace cuda
 {
@@ -197,40 +200,15 @@ Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
 
 }
 
-template<typename T, bool conjugate, bool both_conjugate>
-Array<T> dot_(const Array<T> &lhs, const Array<T> &rhs,
-              af_mat_prop optLhs, af_mat_prop optRhs)
-{
-    int N = lhs.dims()[0];
-
-    T out;
-
-    CUBLAS_CHECK((dot_func<T, conjugate>()(
-                 getHandle(),
-                 N,
-                 lhs.get(), lhs.strides()[0],
-                 rhs.get(), rhs.strides()[0],
-                 &out)));
-
-    if(both_conjugate)
-        return createValueArray(af::dim4(1), conj(out));
-    else
-        return createValueArray(af::dim4(1), out);
-}
-
 template<typename T>
 Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
              af_mat_prop optLhs, af_mat_prop optRhs)
 {
-    if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) {
-        return dot_<T, false, true>(lhs, rhs, optLhs, optRhs);
-    } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) {
-        return dot_<T, true, false>(lhs, rhs, optLhs, optRhs);
-    } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) {
-        return dot_<T, true, false>(rhs, lhs, optRhs, optLhs);
-    } else {
-        return dot_<T, false, false>(lhs, rhs, optLhs, optRhs);
-    }
+    const Array<T> lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj<T>(lhs));
+    const Array<T> rhs_ = (optRhs == AF_MAT_NONE ? rhs : conj<T>(rhs));
+
+    const Array<T> temp = arithOp<T, af_mul_t>(lhs_, rhs_, lhs_.dims());
+    return reduce<af_add_t, T, T>(temp, 0, false, 0);
 }
 
 template<typename T>
diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp
index 82304b9a22..b7de74a7de 100644
--- a/src/backend/cuda/complex.hpp
+++ b/src/backend/cuda/complex.hpp
@@ -17,25 +17,25 @@
 
 namespace cuda
 {
-    template<typename T> static const std::string cplx_name() { return "@___noop"; }
-    template<> STATIC_ const std::string cplx_name<cfloat >() { return cuMangledName<float , true>("___cplx"); }
-    template<> STATIC_ const std::string cplx_name<cdouble>() { return cuMangledName<double, true>("___cplx"); }
+    template<typename T> static const std::string cplx_name() { return cuMangledName<T, false>("___noop");       }
+    template<> STATIC_ const std::string cplx_name<cfloat >() { return cuMangledName<float , true>("___cplx");   }
+    template<> STATIC_ const std::string cplx_name<cdouble>() { return cuMangledName<double, true>("___cplx");   }
 
-    template<typename T> static const std::string real_name() { return "@___noop"; }
+    template<typename T> static const std::string real_name() { return cuMangledName<T, false>("___noop");       }
     template<> STATIC_ const std::string real_name<cfloat >() { return cuMangledName<cfloat , false>("___real"); }
     template<> STATIC_ const std::string real_name<cdouble>() { return cuMangledName<cdouble, false>("___real"); }
 
-    template<typename T> static const std::string imag_name() { return "@___noop"; }
+    template<typename T> static const std::string imag_name() { return cuMangledName<T, false>("___noop");       }
     template<> STATIC_ const std::string imag_name<cfloat >() { return cuMangledName<cfloat , false>("___imag"); }
     template<> STATIC_ const std::string imag_name<cdouble>() { return cuMangledName<cdouble, false>("___imag"); }
 
-    template<typename T> static const std::string abs_name() { return "@___noop"; }
-    template<> STATIC_ const std::string abs_name<float  >() { return cuMangledName<float  , false>("___abs"); }
-    template<> STATIC_ const std::string abs_name<double >() { return cuMangledName<double , false>("___abs"); }
-    template<> STATIC_ const std::string abs_name<cfloat >() { return cuMangledName<cfloat , false>("___abs"); }
-    template<> STATIC_ const std::string abs_name<cdouble>() { return cuMangledName<cdouble, false>("___abs"); }
+    template<typename T> static const std::string abs_name()  { return cuMangledName<T, false>("___noop");       }
+    template<> STATIC_ const std::string abs_name<float  >()  { return cuMangledName<float  , false>("___abs");  }
+    template<> STATIC_ const std::string abs_name<double >()  { return cuMangledName<double , false>("___abs");  }
+    template<> STATIC_ const std::string abs_name<cfloat >()  { return cuMangledName<cfloat , false>("___abs");  }
+    template<> STATIC_ const std::string abs_name<cdouble>()  { return cuMangledName<cdouble, false>("___abs");  }
 
-    template<typename T> static const std::string conj_name() { return "@___noop"; }
+    template<typename T> static const std::string conj_name() { return cuMangledName<T, false>("___noop");       }
     template<> STATIC_ const std::string conj_name<cfloat >() { return cuMangledName<cfloat , false>("___conj"); }
     template<> STATIC_ const std::string conj_name<cdouble>() { return cuMangledName<cdouble, false>("___conj"); }
 
diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu
index 90f9970239..35e5c83178 100644
--- a/src/backend/cuda/copy.cu
+++ b/src/backend/cuda/copy.cu
@@ -23,12 +23,12 @@ namespace cuda
     void copyData(T *data, const Array<T> &A)
     {
         // FIXME: Merge this with copyArray
-        evalArray(A);
+        A.eval();
 
         Array<T> out = A;
         const T *ptr = NULL;
 
-        if (A.isOwner() || // No offsets, No strides
+        if (A.isLinear() || // No offsets, No strides
             A.ndims() == 1 // Simple offset, no strides.
             ) {
 
@@ -71,7 +71,6 @@ namespace cuda
         ARG_ASSERT(1, (in.ndims() == dims.ndims()));
         Array<outType> ret = createEmptyArray<outType>(dims);
         kernel::copy<inType, outType>(ret, in, in.ndims(), default_value, factor);
-        CUDA_CHECK(cudaDeviceSynchronize());
         return ret;
     }
 
diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
index 58265871c2..b85a80b10c 100644
--- a/src/backend/cuda/cpu_lapack/lapack_helper.hpp
+++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
@@ -19,17 +19,17 @@
 #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
 #define LAPACK_NAME(fn) LAPACKE_##fn
 
-#ifdef __APPLE__
-#include <Accelerate/Accelerate.h>
-#include <lapacke.hpp>
-#undef AF_LAPACK_COL_MAJOR
-#define AF_LAPACK_COL_MAJOR 0
-#else
 #ifdef USE_MKL
-#include<mkl_lapacke.h>
-#else // NETLIB LAPACKE
-#include<lapacke.h>
-#endif
+    #include<mkl_lapacke.h>
+#else
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+        #include <lapacke.hpp>
+        #undef AF_LAPACK_COL_MAJOR
+        #define AF_LAPACK_COL_MAJOR 0
+    #else // NETLIB LAPACKE
+        #include<lapacke.h>
+    #endif
 #endif
 
 #endif
diff --git a/src/backend/cuda/debug_cuda.hpp b/src/backend/cuda/debug_cuda.hpp
index 084d12f804..f5424950dc 100644
--- a/src/backend/cuda/debug_cuda.hpp
+++ b/src/backend/cuda/debug_cuda.hpp
@@ -51,8 +51,12 @@
 
 #else
 
-#define POST_LAUNCH_CHECK() do {                \
-        CUDA_CHECK(cudaPeekAtLastError());      \
-    } while(0)                                  \
+#define POST_LAUNCH_CHECK() do {                                        \
+    if(cuda::synchronize_calls()) {                                     \
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); \
+    } else {                                                            \
+        CUDA_CHECK(cudaPeekAtLastError());                              \
+    }                                                                   \
+  } while(0)                                                            \
 
 #endif
diff --git a/src/backend/cuda/err_cuda.hpp b/src/backend/cuda/err_cuda.hpp
index a975fb5336..dd87bdfc2b 100644
--- a/src/backend/cuda/err_cuda.hpp
+++ b/src/backend/cuda/err_cuda.hpp
@@ -17,22 +17,23 @@
                 __AF_FILENAME__, __LINE__, "CUDA");     \
     } while(0)
 
-#define CUDA_CHECK(fn) do {                                 \
-        cudaError_t _cuda_error = fn;                       \
-        if (_cuda_error != cudaSuccess) {                   \
-            char cuda_err_msg[1024];                        \
-            snprintf(cuda_err_msg,                          \
-                     sizeof(cuda_err_msg),                  \
-                     "CUDA Error (%d): %s\n",               \
-                     (int)(_cuda_error),                    \
-                     cudaGetErrorString(                    \
-                         cudaGetLastError()));              \
-                                                            \
-            if (_cuda_error == cudaErrorMemoryAllocation) { \
-                AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM);      \
-            } else {                                        \
-                AF_ERROR(cuda_err_msg,                      \
-                         AF_ERR_INTERNAL);                  \
-            }                                               \
-        }                                                   \
+#define CUDA_CHECK(fn) do {                                         \
+        cudaError_t _cuda_error = fn;                               \
+        if (_cuda_error != cudaSuccess) {                           \
+            char cuda_err_msg[1024];                                \
+            snprintf(cuda_err_msg,                                  \
+                     sizeof(cuda_err_msg),                          \
+                     "CUDA Error (%d): %s\n",                       \
+                     (int)(_cuda_error),                            \
+                     cudaGetErrorString(                            \
+                         cudaGetLastError()));                      \
+                                                                    \
+            if (_cuda_error == cudaErrorMemoryAllocation) {         \
+                AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM);              \
+            } else if (_cuda_error == cudaErrorDevicesUnavailable) {\
+                AF_ERROR(cuda_err_msg, AF_ERR_DRIVER);              \
+            } else {                                                \
+                AF_ERROR(cuda_err_msg, AF_ERR_INTERNAL);            \
+            }                                                       \
+        }                                                           \
     } while(0)
diff --git a/src/backend/cuda/interopManager.cu b/src/backend/cuda/interopManager.cu
index b492a5ee1d..a6e2fcf9bd 100644
--- a/src/backend/cuda/interopManager.cu
+++ b/src/backend/cuda/interopManager.cu
@@ -14,6 +14,7 @@
 
 #include <interopManager.hpp>
 #include <err_cuda.hpp>
+#include <util.hpp>
 #include <cstdio>
 
 namespace cuda
@@ -36,10 +37,10 @@ InteropManager::~InteropManager()
         }
     } catch (AfError &ex) {
 
-        const char* perr = getenv("AF_PRINT_ERRORS");
-
-        if(perr && perr[0] != '0') {
-            fprintf(stderr, "%s\n", ex.what());
+        std::string perr = getEnvVar("AF_PRINT_ERRORS");
+        if(!perr.empty()) {
+            if(perr != "0")
+                fprintf(stderr, "%s\n", ex.what());
         }
     }
 }
diff --git a/src/backend/cuda/kernel/fast_pyramid.hpp b/src/backend/cuda/kernel/fast_pyramid.hpp
index 61a9c7ac32..d2e5903788 100644
--- a/src/backend/cuda/kernel/fast_pyramid.hpp
+++ b/src/backend/cuda/kernel/fast_pyramid.hpp
@@ -65,7 +65,11 @@ void fast_pyramid(std::vector<unsigned>& feat_pyr,
     lvl_best[max_levels-1] = max_feat - feat_sum;
 
     // Hold multi-scale image pyramids
-    img_pyr.reserve(max_levels);
+    static const dim4 dims0;
+    static const CParam<T> emptyCParam(NULL, dims0.get(), dims0.get());
+    // Need to do this as CParam does not have a default constructor
+    // And resize needs a default constructor or default value prior to C++11
+    img_pyr.resize(max_levels, emptyCParam);
 
     // Create multi-scale image pyramid
     for (unsigned i = 0; i < max_levels; i++) {
diff --git a/src/backend/cuda/kernel/random.hpp b/src/backend/cuda/kernel/random.hpp
index 4d960ae46b..96cf098c03 100644
--- a/src/backend/cuda/kernel/random.hpp
+++ b/src/backend/cuda/kernel/random.hpp
@@ -49,8 +49,18 @@ namespace kernel
 
         ~curandStateManager()
         {
-            //if(_state != NULL) memFree((char*)_state);
-            if(_state != NULL) CUDA_CHECK(cudaFree(_state));
+            try {
+                if (_state != NULL) {
+                    cudaError_t err = cudaFree(_state);
+                    if (err != cudaErrorCudartUnloading) {
+                        CUDA_CHECK(err);
+                    }
+                }
+            } catch (AfError &err) { // catch by reference: no copy, no slicing
+                if (err.getError() != AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable
+                    throw; // rethrow the original exception object unchanged
+                }
+            }
         }
 
         unsigned long long getSeed() const
@@ -69,7 +79,6 @@ namespace kernel
             if(_state)
                 return _state;
 
-            //_state = (curandState_t*)memAlloc<char>(BLOCKS * THREADS * sizeof(curandState_t));
             CUDA_CHECK(cudaMalloc((void **)&_state, BLOCKS * THREADS * sizeof(curandState_t)));
             this->resetSeed();
             return _state;
diff --git a/src/backend/cuda/kernel/rotate.hpp b/src/backend/cuda/kernel/rotate.hpp
index d63f010c3b..3cea7f2698 100644
--- a/src/backend/cuda/kernel/rotate.hpp
+++ b/src/backend/cuda/kernel/rotate.hpp
@@ -60,11 +60,11 @@ namespace cuda
 
             switch(method) {
                 case AF_INTERP_NEAREST:
-                    transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages); break;
+                    transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break;
                 case AF_INTERP_BILINEAR:
-                    transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages); break;
+                    transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break;
                 case AF_INTERP_LOWER:
-                    transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages); break;
+                    transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break;
                 default: break;
             }
         }
diff --git a/src/backend/cuda/kernel/select.hpp b/src/backend/cuda/kernel/select.hpp
index ab5bf2da7b..ea242e45dd 100644
--- a/src/backend/cuda/kernel/select.hpp
+++ b/src/backend/cuda/kernel/select.hpp
@@ -41,7 +41,7 @@ namespace cuda
             const int idw = blockIdx.y / blk_y;
 
             const int blockIdx_x = blockIdx.x - idz * blk_x;
-            const int blockIdx_y = blockIdx.y - idz * blk_y;
+            const int blockIdx_y = blockIdx.y - idw * blk_y;
 
             const int idx = blockIdx_x * blockDim.x + threadIdx.x;
             const int idy = blockIdx_y * blockDim.y + threadIdx.y;
@@ -110,7 +110,7 @@ namespace cuda
             const int idw = blockIdx.y / blk_y;
 
             const int blockIdx_x = blockIdx.x - idz * blk_x;
-            const int blockIdx_y = blockIdx.y - idz * blk_y;
+            const int blockIdx_y = blockIdx.y - idw * blk_y;
 
             const int idx = blockIdx_x * blockDim.x + threadIdx.x;
             const int idy = blockIdx_y * blockDim.y + threadIdx.y;
diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp
index 07be0a35b3..599e62cf9d 100644
--- a/src/backend/cuda/kernel/transform.hpp
+++ b/src/backend/cuda/kernel/transform.hpp
@@ -24,21 +24,42 @@ namespace cuda
         // Used for batching images
         static const unsigned TI = 4;
 
-        __constant__ float c_tmat[6 * 256];
+        __constant__ float c_tmat[9 * 256];
 
         template <typename T>
         __host__ __device__
-        void calc_affine_inverse(T *txo, const T *txi)
+        void calc_transf_inverse(T *txo, const T *txi, const bool perspective)
         {
-            T det = txi[0]*txi[4] - txi[1]*txi[3];
-
-            txo[0] = txi[4] / det;
-            txo[1] = txi[3] / det;
-            txo[3] = txi[1] / det;
-            txo[4] = txi[0] / det;
-
-            txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
-            txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
+            if (perspective) {
+                txo[0] =   txi[4]*txi[8] - txi[5]*txi[7];
+                txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]);
+                txo[2] =   txi[1]*txi[5] - txi[2]*txi[4];
+
+                txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]);
+                txo[4] =   txi[0]*txi[8] - txi[2]*txi[6];
+                txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]);
+
+                txo[6] =   txi[3]*txi[7] - txi[4]*txi[6];
+                txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]);
+                txo[8] =   txi[0]*txi[4] - txi[1]*txi[3];
+
+                T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6];
+
+                txo[0] /= det; txo[1] /= det; txo[2] /= det;
+                txo[3] /= det; txo[4] /= det; txo[5] /= det;
+                txo[6] /= det; txo[7] /= det; txo[8] /= det;
+                }
+            else {
+                T det = txi[0]*txi[4] - txi[1]*txi[3];
+
+                txo[0] = txi[4] / det;
+                txo[1] = txi[3] / det;
+                txo[3] = txi[1] / det;
+                txo[4] = txi[0] / det;
+
+                txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
+                txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
+            }
         }
 
         ///////////////////////////////////////////////////////////////////////////
@@ -47,7 +68,8 @@ namespace cuda
         template<typename T, bool inverse, af_interp_type method>
         __global__ static void
         transform_kernel(Param<T> out, CParam<T> in, const int nimages,
-                         const int ntransforms, const int blocksXPerImage)
+                         const int ntransforms, const int blocksXPerImage,
+                         const int transf_len, const bool perspective)
         {
             // Compute which image set
             const int setId = blockIdx.x / blocksXPerImage;
@@ -77,30 +99,32 @@ namespace cuda
             const T *iptr = in.ptr  + setId * nimages * in.strides[2];
 
             // Transform is in constant memory.
-            const float *tmat_ptr = c_tmat + t_idx * 6;
-            float tmat[6];
+            const float *tmat_ptr = c_tmat + t_idx * transf_len;
+            // Per-thread scratch copy of the transform. Sized for the largest
+            // layout (9 floats, perspective); the affine case uses the first 6.
+            float tmat[9];
 
             // We expect a inverse transform matrix by default
             // If it is an forward transform, then we need its inverse
             if(inverse) {
-                #pragma unroll
-                for(int i = 0; i < 6; i++)
+                #pragma unroll 3
+                for(int i = 0; i < transf_len; i++)
                     tmat[i] = tmat_ptr[i];
             } else {
-                calc_affine_inverse(tmat, tmat_ptr);
+                calc_transf_inverse(tmat, tmat_ptr, perspective);
             }
 
             if (xido >= out.dims[0] && yido >= out.dims[1]) return;
 
             switch(method) {
                 case AF_INTERP_NEAREST:
-                    transform_n(optr, out, iptr, in, t.tmat, xido, yido, limages); break;
+                    transform_n(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break;
                 case AF_INTERP_BILINEAR:
-                    transform_b(optr, out, iptr, in, tmat, xido, yido, limages); break;
+                    transform_b(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break;
                 case AF_INTERP_LOWER:
-                    transform_l(optr, out, iptr, in, tmat, xido, yido, limages); break;
+                    transform_l(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break;
                 default: break;
             }
         }
 
         ///////////////////////////////////////////////////////////////////////////
@@ -108,15 +132,18 @@ namespace cuda
         ///////////////////////////////////////////////////////////////////////////
         template <typename T, af_interp_type method>
         void transform(Param<T> out, CParam<T> in, CParam<float> tf,
-                       const bool inverse)
+                       const bool inverse, const bool perspective)
         {
             int nimages = in.dims[2];
             // Multiplied in src/backend/transform.cpp
             const int ntransforms = out.dims[2] / in.dims[2];
 
+
+            const int transf_len = (perspective) ? 9 : 6;
+
             // Copy transform to constant memory.
-            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
-                                          cudaMemcpyDeviceToDevice,
+            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * transf_len * sizeof(float),
+                                          0, cudaMemcpyDeviceToDevice,
                                           cuda::getStream(cuda::getActiveDeviceId())));
 
             dim3 threads(TX, TY, 1);
@@ -133,10 +160,12 @@ namespace cuda
 
             if(inverse) {
                 CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
-                                out, in, nimages, ntransforms, blocksXPerImage);
+                                out, in, nimages, ntransforms, blocksXPerImage,
+                                transf_len, perspective);
             } else {
                 CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
-                                out, in, nimages, ntransforms, blocksXPerImage);
+                                out, in, nimages, ntransforms, blocksXPerImage,
+                                transf_len, perspective);
             }
             POST_LAUNCH_CHECK();
         }
diff --git a/src/backend/cuda/kernel/transform_interp.hpp b/src/backend/cuda/kernel/transform_interp.hpp
index 5a88fc4d76..1554b8ec62 100644
--- a/src/backend/cuda/kernel/transform_interp.hpp
+++ b/src/backend/cuda/kernel/transform_interp.hpp
@@ -42,15 +42,28 @@ namespace cuda
         template<typename T>
         __device__
         void transform_n(T *optr, Param<T> out, const T *iptr, CParam<T> in, const float *tmat,
-                         const int xido, const int yido, const int nimages)
+                         const int xido, const int yido, const int nimages,
+                         const bool perspective)
         {
             // Compute input index
-            int xidi = round(xido * tmat[0]
+            int xidi = 0, yidi = 0;
+            if (perspective) {
+                const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+                xidi = round((xido * tmat[0]
+                            + yido * tmat[1]
+                                   + tmat[2]) / W);
+                yidi = round((xido * tmat[3]
+                            + yido * tmat[4]
+                                   + tmat[5]) / W);
+            }
+            else {
+                xidi = round(xido * tmat[0]
                            + yido * tmat[1]
                                   + tmat[2]);
-            int yidi = round(xido * tmat[3]
+                yidi = round(xido * tmat[3]
                            + yido * tmat[4]
                                   + tmat[5]);
+            }
 
             // Makes scale give same output as resize
             // But fails rotate tests
@@ -76,17 +89,30 @@ namespace cuda
         template<typename T>
         __device__
         void transform_b(T *optr, Param<T> out, const T *iptr, CParam<T> in, const float *tmat,
-                         const int xido, const int yido, const int nimages)
+                         const int xido, const int yido, const int nimages,
+                         const bool perspective)
         {
             const int loco = (yido * out.strides[1] + xido);
 
             // Compute input index
-            const float xidi = xido * tmat[0]
-                             + yido * tmat[1]
-                                    + tmat[2];
-            const float yidi = xido * tmat[3]
-                             + yido * tmat[4]
-                                    + tmat[5];
+            float xidi = 0.0f, yidi = 0.0f;
+            if (perspective) {
+                const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+                xidi = (xido * tmat[0]
+                      + yido * tmat[1]
+                             + tmat[2]) / W;
+                yidi = (xido * tmat[3]
+                      + yido * tmat[4]
+                             + tmat[5]) / W;
+            }
+            else {
+                xidi = xido * tmat[0]
+                     + yido * tmat[1]
+                            + tmat[2];
+                yidi = xido * tmat[3]
+                     + yido * tmat[4]
+                            + tmat[5];
+            }
 
             if (xidi < -0.0001 || yidi < -0.0001 || in.dims[0] < xidi || in.dims[1] < yidi) {
                 for(int i = 0; i < nimages; i++) {
@@ -133,15 +159,28 @@ namespace cuda
         template<typename T>
         __device__
         void transform_l(T *optr, Param<T> out, const T *iptr, CParam<T> in, const float *tmat,
-                         const int xido, const int yido, const int nimages)
+                         const int xido, const int yido, const int nimages,
+                         const bool perspective)
         {
             // Compute input index
-            int xidi = floor(xido * tmat[0]
+            int xidi = 0, yidi = 0;
+            if (perspective) {
+                const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+                xidi = floor((xido * tmat[0]
+                            + yido * tmat[1]
+                                   + tmat[2]) / W);
+                yidi = floor((xido * tmat[3]
+                            + yido * tmat[4]
+                                   + tmat[5]) / W);
+            }
+            else {
+                xidi = floor(xido * tmat[0]
                            + yido * tmat[1]
                                   + tmat[2]);
-            int yidi = floor(xido * tmat[3]
+                yidi = floor(xido * tmat[3]
                            + yido * tmat[4]
                                   + tmat[5]);
+            }
 
             // Makes scale give same output as resize
             // But fails rotate tests
diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu
index 2a45d4b9f5..ce0b545a84 100644
--- a/src/backend/cuda/lu.cu
+++ b/src/backend/cuda/lu.cu
@@ -156,6 +156,11 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
     return pivot;
 }
 
+bool isLAPACKAvailable()
+{
+    return true;
+}
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -186,6 +191,11 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
     return cpu::lu_inplace(in, convert_pivot);
 }
 
+bool isLAPACKAvailable()
+{
+    return true;
+}
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -213,6 +223,11 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
              AF_ERR_NOT_CONFIGURED);
 }
 
+bool isLAPACKAvailable()
+{
+    return false;
+}
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
diff --git a/src/backend/cuda/lu.hpp b/src/backend/cuda/lu.hpp
index 0753129d6b..acf9dbaad7 100644
--- a/src/backend/cuda/lu.hpp
+++ b/src/backend/cuda/lu.hpp
@@ -17,4 +17,6 @@ namespace cuda
 
     template<typename T>
     Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+
+    bool isLAPACKAvailable();
 }
diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp
index 9b3d731b4b..51eb507320 100644
--- a/src/backend/cuda/memory.cpp
+++ b/src/backend/cuda/memory.cpp
@@ -12,367 +12,233 @@
 #include <cuda_runtime_api.h>
 #include <cuda_runtime.h>
 #include <err_cuda.hpp>
+#include <util.hpp>
 #include <types.hpp>
+#include <iostream>
+#include <iomanip>
+#include <string>
 #include <map>
 #include <dispatch.hpp>
 #include <platform.hpp>
+#include <MemoryManager.hpp>
 
-namespace cuda
-{
-    static size_t memory_resolution = 1024; //1KB
-
-    void setMemStepSize(size_t step_bytes)
-    {
-        memory_resolution = step_bytes;
-    }
-
-    size_t getMemStepSize(void)
-    {
-        return memory_resolution;
-    }
-
-    template<typename T>
-    static void cudaFreeWrapper(T *ptr)
-    {
-        cudaError_t err = cudaFree(ptr);
-        if (err != cudaErrorCudartUnloading) // see issue #167
-            CUDA_CHECK(err);
-    }
-
-    template<typename T>
-    static void pinnedFreeWrapper(T *ptr)
-    {
-        cudaError_t err = cudaFreeHost(ptr);
-        if (err != cudaErrorCudartUnloading) // see issue #167
-            CUDA_CHECK(err);
-    }
-
-#ifdef AF_CUDA_MEM_DEBUG
 
-    template<typename T>
-    T* memAlloc(const size_t &elements)
-    {
-        T* ptr = NULL;
-        CUDA_CHECK(cudaMalloc(&ptr, elements * sizeof(T)));
-        return ptr;
-    }
-
-    template<typename T>
-    void memFree(T *ptr)
-    {
-        cudaFreeWrapper(ptr); // Free it because we are not sure what the size is
-    }
-
-    template<typename T>
-    void memPop(const T *ptr)
-    {
-        return;
-    }
-
-    template<typename T>
-    void memPush(const T *ptr)
-    {
-        return;
-    }
-
-    template<typename T>
-    T* pinnedAlloc(const size_t &elements)
-    {
-        T* ptr = NULL;
-        CUDA_CHECK(cudaMallocHost((void **)(&ptr), elements * sizeof(T)));
-        return (T*)ptr;
-    }
-
-    template<typename T>
-    void pinnedFree(T *ptr)
-    {
-        pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is
-    }
-
-    void garbageCollect()
-    {
-    }
+#ifndef AF_MEM_DEBUG
+#define AF_MEM_DEBUG 0
+#endif
 
-    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-    {
-    }
+#ifndef AF_CUDA_MEM_DEBUG
+#define AF_CUDA_MEM_DEBUG 0
+#endif
 
-#else
+namespace cuda
+{
 
-    // Manager Class
-    // Dummy used to call garbage collection at the end of the program
-    class Manager
+class MemoryManager  : public common::MemoryManager
+{
+    int getActiveDeviceId();
+    size_t getMaxMemorySize(int id);
+public:
+    MemoryManager();
+    void *nativeAlloc(const size_t bytes);
+    void nativeFree(void *ptr);
+    ~MemoryManager()
     {
-        public:
-        static bool initialized;
-        Manager()
-        {
-            initialized = true;
-        }
-
-        ~Manager()
-        {
-            // Destructors should not through exceptions
+        common::lock_guard_t lock(this->memory_mutex);
+        for (int n = 0; n < getDeviceCount(); n++) {
             try {
-                for(int i = 0; i < getDeviceCount(); i++) {
-                    setDevice(i);
-                    garbageCollect();
-                }
-                pinnedGarbageCollect();
-
-            } catch (AfError &ex) {
-
-                const char* perr = getenv("AF_PRINT_ERRORS");
-                if(perr && perr[0] != '0') {
-                    fprintf(stderr, "%s\n", ex.what());
-                }
+                cuda::setDevice(n);
+                this->garbageCollect();
+            } catch(AfError err) {
+                continue; // Do not throw any errors while shutting down
             }
         }
-    };
-
-    bool Manager::initialized = false;
-
-    static void managerInit()
-    {
-        if(Manager::initialized == false)
-            static Manager pm = Manager();
     }
+};
 
-    typedef struct
-    {
-        bool is_free;
-        bool is_unlinked;
-        size_t bytes;
-    } mem_info;
-
-    static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0};
-    static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0};
-    static size_t total_bytes[DeviceManager::MAX_DEVICES] = {0};
-    typedef std::map<void *, mem_info> mem_t;
-    typedef mem_t::iterator mem_iter;
-
-    mem_t memory_maps[DeviceManager::MAX_DEVICES];
-
-    void garbageCollect()
+// CUDA Pinned Memory does not depend on device
+// So we pass 1 as numDevices to the constructor so that it creates 1 vector
+// of memory_info
+// When allocating and freeing, it doesn't really matter which device is active
+class MemoryManagerPinned  : public common::MemoryManager
+{
+    int getActiveDeviceId();
+    size_t getMaxMemorySize(int id);
+public:
+    MemoryManagerPinned();
+    void *nativeAlloc(const size_t bytes);
+    void nativeFree(void *ptr);
+    ~MemoryManagerPinned()
     {
-        int n = getActiveDeviceId();
-
-        for(mem_iter iter = memory_maps[n].begin();
-            iter != memory_maps[n].end(); ++iter) {
-
-            if ((iter->second).is_free) {
-
-                if (!(iter->second).is_unlinked) {
-                    cudaFreeWrapper(iter->first);
-                    total_bytes[n] -= iter->second.bytes;
-                }
-            }
-        }
-
-        mem_iter memory_curr = memory_maps[n].begin();
-        mem_iter memory_end  = memory_maps[n].end();
-
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.is_free  && !memory_curr->second.is_unlinked) {
-                memory_maps[n].erase(memory_curr++);
-            } else {
-                ++memory_curr;
-            }
-        }
+        common::lock_guard_t lock(this->memory_mutex);
+        this->garbageCollect();
     }
+};
 
-    template<typename T>
-    T* memAlloc(const size_t &elements)
-    {
-        managerInit();
-        int n = getActiveDeviceId();
-        T* ptr = NULL;
-        size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
-
-        if (elements > 0) {
-
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) {
-                garbageCollect();
-            }
-
-            for(mem_iter iter = memory_maps[n].begin();
-                iter != memory_maps[n].end(); ++iter) {
-
-                mem_info info = iter->second;
+int MemoryManager::getActiveDeviceId()
+{
+    return cuda::getActiveDeviceId();
+}
 
-                if (  info.is_free &&
-                     !info.is_unlinked &&
-                      info.bytes == alloc_bytes) {
+size_t MemoryManager::getMaxMemorySize(int id)
+{
+    return cuda::getDeviceMemorySize(id);
+}
 
-                    iter->second.is_free = false;
-                    used_bytes[n] += alloc_bytes;
-                    used_buffers[n]++;
-                    return (T *)iter->first;
-                }
-            }
+MemoryManager::MemoryManager() :
+    common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG)
+{
+    this->setMaxMemorySize();
+}
 
-            // Perform garbage collection if memory can not be allocated
-            if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
-                garbageCollect();
-                CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
-            }
+void *MemoryManager::nativeAlloc(const size_t bytes)
+{
+    void *ptr = NULL;
+    CUDA_CHECK(cudaMalloc(&ptr, bytes));
+    return ptr;
+}
 
-            mem_info info = {false, false, alloc_bytes};
-            memory_maps[n][ptr] = info;
-            used_bytes[n] += alloc_bytes;
-            used_buffers[n]++;
-            total_bytes[n] += alloc_bytes;
-        }
-        return ptr;
+void MemoryManager::nativeFree(void *ptr)
+{
+    cudaError_t err = cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+        CUDA_CHECK(err);
     }
+}
 
-    template<typename T>
-    void memFree(T *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find((void *)ptr);
-
-        if (iter != memory_maps[n].end()) {
+static MemoryManager &getMemoryManager()
+{
+    static MemoryManager instance;
+    return instance;
+}
 
-            iter->second.is_free = true;
-            if ((iter->second).is_unlinked) return;
+int MemoryManagerPinned::getActiveDeviceId()
+{
+    return 0; // pinned uses a single vector
+}
 
-            used_bytes[n] -= iter->second.bytes;
-            used_buffers[n]--;
+size_t MemoryManagerPinned::getMaxMemorySize(int id)
+{
+    return cuda::getHostMemorySize();
+}
 
-        } else {
-            cudaFreeWrapper(ptr); // Free it because we are not sure what the size is
-        }
-    }
+MemoryManagerPinned::MemoryManagerPinned() :
+    common::MemoryManager(1, common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG)
+{
+    this->setMaxMemorySize();
+}
 
-    template<typename T>
-    void memPop(const T *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find((void *)ptr);
+void *MemoryManagerPinned::nativeAlloc(const size_t bytes)
+{
+    void *ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, bytes));
+    return ptr;
+}
 
-        if (iter != memory_maps[n].end()) {
-            iter->second.is_unlinked = true;
-        } else {
+void MemoryManagerPinned::nativeFree(void *ptr)
+{
+    cudaError_t err = cudaFreeHost(ptr);
+    if (err != cudaErrorCudartUnloading) {
+        CUDA_CHECK(err);
+    }
+}
 
-            mem_info info = { false,
-                              true,
-                              100 }; //This number is not relevant
+static MemoryManagerPinned &getMemoryManagerPinned()
+{
+    static MemoryManagerPinned instance;
+    return instance;
+}
 
-            memory_maps[n][(void *)ptr] = info;
-        }
-    }
+void setMemStepSize(size_t step_bytes)
+{
+    getMemoryManager().setMemStepSize(step_bytes);
+}
 
-    template<typename T>
-    void memPush(const T *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find((void *)ptr);
-        if (iter != memory_maps[n].end()) {
-            iter->second.is_unlinked = false;
-        }
-    }
+size_t getMemStepSize(void)
+{
+    return getMemoryManager().getMemStepSize();
+}
 
-    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-    {
-        int n = getActiveDeviceId();
-        if (alloc_bytes   ) *alloc_bytes   = total_bytes[n];
-        if (alloc_buffers ) *alloc_buffers = memory_maps[n].size();
-        if (lock_bytes    ) *lock_bytes    = used_bytes[n];
-        if (lock_buffers  ) *lock_buffers  = used_buffers[n];
-    }
+size_t getMaxBytes()
+{
+    return getMemoryManager().getMaxBytes();
+}
 
-    //////////////////////////////////////////////////////////////////////////////
-    mem_t pinned_maps;
-    static size_t pinned_used_bytes = 0;
+unsigned getMaxBuffers()
+{
+    return getMemoryManager().getMaxBuffers();
+}
 
-    void pinnedGarbageCollect()
-    {
-        for(mem_iter iter = pinned_maps.begin(); iter != pinned_maps.end(); ++iter) {
-            if ((iter->second).is_free) {
-                pinnedFreeWrapper(iter->first);
-            }
-        }
+void garbageCollect()
+{
+    getMemoryManager().garbageCollect();
+}
 
-        mem_iter memory_curr = pinned_maps.begin();
-        mem_iter memory_end  = pinned_maps.end();
+void printMemInfo(const char *msg, const int device)
+{
+    getMemoryManager().printInfo(msg, device);
+}
 
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.is_free) {
-                pinned_maps.erase(memory_curr++);
-            } else {
-                ++memory_curr;
-            }
-        }
-    }
+template<typename T>
+T* memAlloc(const size_t &elements)
+{
+    return (T *)getMemoryManager().alloc(elements * sizeof(T), false);
+}
 
-    template<typename T>
-    T* pinnedAlloc(const size_t &elements)
-    {
-        managerInit();
-        T* ptr = NULL;
-        // Allocate the higher megabyte. Overhead of creating pinned memory is
-        // more so we want more resuable memory.
-        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;
-
-        if (elements > 0) {
-
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
-                pinnedGarbageCollect();
-            }
+void* memAllocUser(const size_t &bytes)
+{
+    return getMemoryManager().alloc(bytes, true);
+}
+template<typename T>
+void memFree(T *ptr)
+{
+    return getMemoryManager().unlock((void *)ptr, false);
+}
 
-            for(mem_iter iter = pinned_maps.begin();
-                iter != pinned_maps.end(); ++iter) {
+void memFreeUser(void *ptr)
+{
+    getMemoryManager().unlock((void *)ptr, true);
+}
 
-                mem_info info = iter->second;
-                if (info.is_free && info.bytes == alloc_bytes) {
-                    iter->second.is_free = false;
-                    pinned_used_bytes += alloc_bytes;
-                    return (T *)iter->first;
-                }
-            }
+void memLock(const void *ptr)
+{
+    getMemoryManager().userLock((void *)ptr);
+}
 
-            // Perform garbage collection if memory can not be allocated
-            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
-                pinnedGarbageCollect();
-                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
-            }
+void memUnlock(const void *ptr)
+{
+    getMemoryManager().userUnlock((void *)ptr);
+}
 
-            mem_info info = {false, false, alloc_bytes};
-            pinned_maps[ptr] = info;
-            pinned_used_bytes += alloc_bytes;
-        }
-        return (T*)ptr;
-    }
+void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                      size_t *lock_bytes,  size_t *lock_buffers)
+{
+    getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers,
+                                  lock_bytes,  lock_buffers);
+}
 
-    template<typename T>
-    void pinnedFree(T *ptr)
-    {
-        mem_iter iter = pinned_maps.find((void *)ptr);
+template<typename T>
+T* pinnedAlloc(const size_t &elements)
+{
+    return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T), false);
+}
 
-        if (iter != pinned_maps.end()) {
-            iter->second.is_free = true;
-            pinned_used_bytes -= iter->second.bytes;
-        } else {
-            pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is
-        }
-    }
+template<typename T>
+void pinnedFree(T* ptr)
+{
+    return getMemoryManagerPinned().unlock((void *)ptr, false);
+}
 
-#endif
+bool checkMemoryLimit()
+{
+    return getMemoryManager().checkMemoryLimit();
+}
 
-#define INSTANTIATE(T)                                  \
-    template T* memAlloc(const size_t &elements);       \
-    template void memFree(T* ptr);                      \
-    template void memPop(const T* ptr);                 \
-    template void memPush(const T* ptr);                \
-    template T* pinnedAlloc(const size_t &elements);    \
-    template void pinnedFree(T* ptr);                   \
+#define INSTANTIATE(T)                                      \
+    template T* memAlloc(const size_t &elements);           \
+    template void memFree(T* ptr);                          \
+    template T* pinnedAlloc(const size_t &elements);        \
+    template void pinnedFree(T* ptr);                       \
 
     INSTANTIATE(float)
     INSTANTIATE(cfloat)
diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp
index 2e5fef2593..80478c13dc 100644
--- a/src/backend/cuda/memory.hpp
+++ b/src/backend/cuda/memory.hpp
@@ -9,24 +9,36 @@
 #pragma once
 
 #include <af/defines.h>
+
 namespace cuda
 {
     template<typename T> T* memAlloc(const size_t &elements);
+    void *memAllocUser(const size_t &bytes);
+
+    // Need these as 2 separate functions and not a default argument.
+    // This is because it is used as the deleter in a shared pointer,
+    // which cannot support default arguments
     template<typename T> void memFree(T* ptr);
-    template<typename T> void memPop(const T *ptr);
-    template<typename T> void memPush(const T *ptr);
+    void memFreeUser(void* ptr);
+
+    void memLock(const void *ptr);
+    void memUnlock(const void *ptr);
 
     template<typename T> T* pinnedAlloc(const size_t &elements);
     template<typename T> void pinnedFree(T* ptr);
 
-    static const unsigned MAX_BUFFERS   = 100;
-    static const unsigned MAX_BYTES     = (1 << 30);
+    size_t getMaxBytes();
+    unsigned getMaxBuffers();
 
     void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
                           size_t *lock_bytes,  size_t *lock_buffers);
     void garbageCollect();
     void pinnedGarbageCollect();
 
+    void printMemInfo(const char *msg, const int device);
+
     void setMemStepSize(size_t step_bytes);
     size_t getMemStepSize(void);
+
+    bool checkMemoryLimit();
 }
diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
index 38e8a04d16..10cfdc886c 100644
--- a/src/backend/cuda/platform.cpp
+++ b/src/backend/cuda/platform.cpp
@@ -11,6 +11,7 @@
 #include <af/cuda.h>
 #include <platform.hpp>
 #include <defines.hpp>
+#include <util.hpp>
 #include <version.hpp>
 #include <driver.h>
 #include <vector>
@@ -21,6 +22,8 @@
 #include <cstdio>
 #include <cstring>
 #include <err_cuda.hpp>
+#include <util.hpp>
+#include <host_memory.hpp>
 
 using namespace std;
 
@@ -60,13 +63,13 @@ static inline int compute2cores(int major, int minor)
     return 0;
 }
 
-// compare two cards based on (in order):
-//   1. flops (theoretical)
-//   2. total memory
-
+// Return true if greater, false if lesser.
+// if equal, it continues to next comparison
 #define COMPARE(a,b,f) do {                     \
-        return ((a)->f >= (b)->f);              \
-    } while (0);
+        if ((a)->f > (b)->f) return true;       \
+        if ((a)->f < (b)->f) return false;      \
+        break;                                  \
+    } while (0)
 
 
 static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_t &r)
@@ -79,7 +82,7 @@ static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_
     COMPARE(lc, rc, flops);
     COMPARE(lc, rc, prop.totalGlobalMem);
     COMPARE(lc, rc, nativeId);
-    return 0;
+    return false;
 }
 
 static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t &r)
@@ -92,7 +95,7 @@ static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t
     COMPARE(lc, rc, prop.major);
     COMPARE(lc, rc, prop.minor);
     COMPARE(lc, rc, nativeId);
-    return 0;
+    return false;
 }
 
 static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r)
@@ -105,7 +108,7 @@ static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r
     COMPARE(lc, rc, prop.major);
     COMPARE(lc, rc, prop.minor);
     COMPARE(lc, rc, nativeId);
-    return 0;
+    return false;
 }
 
 static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r)
@@ -114,7 +117,7 @@ static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r
     const cudaDevice_t *rc = &r;
 
     COMPARE(lc, rc, nativeId);
-    return 0;
+    return false;
 }
 
 static const std::string get_system(void)
@@ -147,18 +150,6 @@ int getBackend()
     return AF_BACKEND_CUDA;
 }
 
-string getInfo()
-{
-    ostringstream info;
-    info << "ArrayFire v" << AF_VERSION
-         << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl;
-    info << getPlatformInfo();
-    for (int i = 0; i < getDeviceCount(); ++i) {
-        info << getDeviceInfo(i);
-    }
-    return info.str();
-}
-
 string getDeviceInfo(int device)
 {
     cudaDeviceProp dev = getDeviceProp(device);
@@ -183,6 +174,18 @@ string getDeviceInfo(int device)
     return info;
 }
 
+string getDeviceInfo()
+{
+    ostringstream info;
+    info << "ArrayFire v" << AF_VERSION
+         << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl;
+    info << getPlatformInfo();
+    for (int i = 0; i < getDeviceCount(); ++i) {
+        info << getDeviceInfo(i);
+    }
+    return info.str();
+}
+
 string getPlatformInfo()
 {
     string driverVersion = getDriverVersion();
@@ -211,7 +214,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute)
     cudaDeviceProp dev = getDeviceProp(getActiveDeviceId());
 
     // Name
-    snprintf(d_name, 32, "%s", dev.name);
+    snprintf(d_name, 64, "%s", dev.name);
 
     //Platform
     std::string cudaRuntime = getCUDARuntimeVersion();
@@ -222,7 +225,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute)
     snprintf(d_compute, 10, "%d.%d", dev.major, dev.minor);
 
     // Sanitize input
-    for (int i = 0; i < 31; i++) {
+    for (int i = 0; i < 63; i++) {
         if (d_name[i] == ' ') {
             if (d_name[i + 1] == 0 || d_name[i + 1] == ' ') d_name[i] = 0;
             else d_name[i] = '_';
@@ -258,6 +261,23 @@ string getCUDARuntimeVersion()
 
 }
 
+unsigned getMaxJitSize()
+{
+    const int MAX_JIT_LEN = 20;
+
+    static int length = 0;
+    if (length == 0) {
+        std::string env_var = getEnvVar("AF_CUDA_MAX_JIT_LEN");
+        if (!env_var.empty()) {
+            length = std::stoi(env_var);
+        } else {
+            length = MAX_JIT_LEN;
+        }
+    }
+
+    return length;
+}
+
 int getDeviceCount()
 {
     return DeviceManager::getInstance().nDevices;
@@ -302,6 +322,16 @@ cudaStream_t getStream(int device)
     return str;
 }
 
+size_t getDeviceMemorySize(int device)
+{
+    return getDeviceProp(device).totalGlobalMem;
+}
+
+size_t getHostMemorySize()
+{
+    return common::getHostMemorySize();
+}
+
 int setDevice(int device)
 {
     return DeviceManager::getInstance().setActiveDevice(device);
@@ -347,8 +377,8 @@ DeviceManager::DeviceManager()
     for(int i = 0; i < (int)MAX_DEVICES; i++)
         streams[i] = (cudaStream_t)0;
 
-    const char* deviceENV = getenv("AF_CUDA_DEFAULT_DEVICE");
-    if(!deviceENV) {
+    std::string deviceENV = getEnvVar("AF_CUDA_DEFAULT_DEVICE");
+    if(deviceENV.empty()) {
         setActiveDevice(0, cuDevices[0].nativeId);
     } else {
         stringstream s(deviceENV);
@@ -368,36 +398,81 @@ void DeviceManager::sortDevices(sort_mode mode)
 {
     switch(mode) {
         case memory :
-            sort(cuDevices.begin(), cuDevices.end(), card_compare_mem);
+            std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_mem);
             break;
         case flops :
-            sort(cuDevices.begin(), cuDevices.end(), card_compare_flops);
+            std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_flops);
             break;
         case compute :
-            sort(cuDevices.begin(), cuDevices.end(), card_compare_compute);
+            std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_compute);
             break;
         case none : default :
-            sort(cuDevices.begin(), cuDevices.end(), card_compare_num);
+            std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_num);
             break;
     }
 }
 
 int DeviceManager::setActiveDevice(int device, int nId)
 {
-    if(device > (int)cuDevices.size()) {
-        return -1;
-    } else {
-        int old = activeDev;
-        if(nId == -1) nId = getDeviceNativeId(device);
-        CUDA_CHECK(cudaSetDevice(nId));
-        activeDev = device;
+    static bool first = true;
 
-        if(!streams[device]) {
-            CUDA_CHECK(cudaStreamCreate(&streams[device]));
-        }
+    int numDevices = cuDevices.size();
+
+    if(device < 0 || device >= numDevices) return -1;
+
+    int old = activeDev;
+    if(nId == -1) nId = getDeviceNativeId(device);
+    CUDA_CHECK(cudaSetDevice(nId));
 
+    cudaError_t err = cudaSuccess;
+    if(!streams[device])
+        err = cudaStreamCreate(&streams[device]);
+
+    activeDev = device;
+
+    if (err == cudaSuccess) return old;
+
+    // Comes when user sets device
+    // If success, return. Else throw error
+    if (!first) {
+        CUDA_CHECK(err);
         return old;
     }
+
+    // Comes only when first is true. Set it to false
+    first = false;
+
+    while(true) {
+        // Check for errors other than DevicesUnavailable
+        // If success, return. Else throw error
+        // If DevicesUnavailable, try other devices (while loop below)
+        if (err != cudaErrorDevicesUnavailable) {
+            CUDA_CHECK(err);
+            activeDev = device;
+            return old;
+        }
+        cudaGetLastError(); // Reset error stack
+#ifndef NDEBUG
+        printf("Warning: Device %d is unavailable. Incrementing to next device \n", device);
+#endif
+
+        // Comes here if the device is in exclusive mode or
+        // otherwise fails streamCreate with this error.
+        // All other errors will error out
+        device++;
+        if (device >= numDevices) break;
+
+        // Can't call getNativeId here as it will cause an infinite loop with the constructor
+        nId = cuDevices[device].nativeId;
+
+        CUDA_CHECK(cudaSetDevice(nId));
+        err = cudaStreamCreate(&streams[device]);
+    }
+
+    // If all devices fail with DevicesUnavailable, then throw this error
+    CUDA_CHECK(err);
+
+    return old;
 }
 
 void sync(int device)
@@ -408,6 +483,11 @@ void sync(int device)
     setDevice(currDevice);
 }
 
+bool synchronize_calls() {
+    static bool sync = getEnvVar("AF_SYNCHRONOUS_CALLS") == "1";
+    return sync;
+}
+
 }
 
 af_err afcu_get_stream(cudaStream_t* stream, int id)
diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp
index 7b649686dc..3fcc67ea5b 100644
--- a/src/backend/cuda/platform.hpp
+++ b/src/backend/cuda/platform.hpp
@@ -22,8 +22,7 @@ namespace cuda
 
 int getBackend();
 
-std::string getInfo();
-
+std::string getDeviceInfo();
 std::string getDeviceInfo(int device);
 
 std::string getPlatformInfo();
@@ -32,12 +31,12 @@ std::string getDriverVersion();
 
 std::string getCUDARuntimeVersion();
 
-std::string getInfo();
-
 bool isDoubleSupported(int device);
 
 void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute);
 
+unsigned getMaxJitSize();
+
 int getDeviceCount();
 
 int getActiveDeviceId();
@@ -46,10 +45,17 @@ int getDeviceNativeId(int device);
 
 cudaStream_t getStream(int device);
 
+size_t getDeviceMemorySize(int device);
+
+size_t getHostMemorySize();
+
 int setDevice(int device);
 
 void sync(int device);
 
+// Returns true if the AF_SYNCHRONOUS_CALLS environment variable is set to 1
+bool synchronize_calls();
+
 cudaDeviceProp getDeviceProp(int device);
 
 struct cudaDevice_t {
@@ -73,7 +79,7 @@ class DeviceManager
 
         friend std::string getCUDARuntimeVersion();
 
-        friend std::string getInfo();
+        friend std::string getDeviceInfo();
 
         friend int getDeviceCount();
 
diff --git a/src/backend/cuda/set.cu b/src/backend/cuda/set.cu
index 63501d3f2a..4629b8b3dc 100644
--- a/src/backend/cuda/set.cu
+++ b/src/backend/cuda/set.cu
@@ -32,7 +32,7 @@ namespace cuda
         Array<T> out = copyArray<T>(in);
 
         thrust::device_ptr<T> out_ptr = thrust::device_pointer_cast<T>(out.get());
-        thrust::device_ptr<T> out_ptr_end = out_ptr + out.dims()[0];
+        thrust::device_ptr<T> out_ptr_end = out_ptr + out.elements();
 
         if(!is_sorted) THRUST_SELECT(thrust::sort, out_ptr, out_ptr_end);
         thrust::device_ptr<T> out_ptr_last;
@@ -55,14 +55,14 @@ namespace cuda
             unique_second = setUnique(second, false);
         }
 
-        dim_t out_size = unique_first.dims()[0] + unique_second.dims()[0];
+        dim_t out_size = unique_first.elements() + unique_second.elements();
         Array<T> out = createEmptyArray<T>(dim4(out_size));
 
         thrust::device_ptr<T> first_ptr = thrust::device_pointer_cast<T>(unique_first.get());
-        thrust::device_ptr<T> first_ptr_end = first_ptr + unique_first.dims()[0];
+        thrust::device_ptr<T> first_ptr_end = first_ptr + unique_first.elements();
 
         thrust::device_ptr<T> second_ptr = thrust::device_pointer_cast<T>(unique_second.get());
-        thrust::device_ptr<T> second_ptr_end = second_ptr + unique_second.dims()[0];
+        thrust::device_ptr<T> second_ptr_end = second_ptr + unique_second.elements();
 
         thrust::device_ptr<T> out_ptr = thrust::device_pointer_cast<T>(out.get());
 
@@ -87,14 +87,14 @@ namespace cuda
             unique_second = setUnique(second, false);
         }
 
-        dim_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]);
+        dim_t out_size = std::max(unique_first.elements(), unique_second.elements());
         Array<T> out = createEmptyArray<T>(dim4(out_size));
 
         thrust::device_ptr<T> first_ptr = thrust::device_pointer_cast<T>(unique_first.get());
-        thrust::device_ptr<T> first_ptr_end = first_ptr + unique_first.dims()[0];
+        thrust::device_ptr<T> first_ptr_end = first_ptr + unique_first.elements();
 
         thrust::device_ptr<T> second_ptr = thrust::device_pointer_cast<T>(unique_second.get());
-        thrust::device_ptr<T> second_ptr_end = second_ptr + unique_second.dims()[0];
+        thrust::device_ptr<T> second_ptr_end = second_ptr + unique_second.elements();
 
         thrust::device_ptr<T> out_ptr = thrust::device_pointer_cast<T>(out.get());
 
diff --git a/src/backend/cuda/sift.cu b/src/backend/cuda/sift.cu
index f3d36d7dfb..ad668af924 100644
--- a/src/backend/cuda/sift.cu
+++ b/src/backend/cuda/sift.cu
@@ -15,7 +15,7 @@
 #include <err_cuda.hpp>
 #include <handle.hpp>
 
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
 #include <kernel/sift_nonfree.hpp>
 #endif
 
@@ -34,7 +34,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const float img_scale, const float feature_ratio,
               const bool compute_GLOH)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     const dim4 dims = in.dims();
 
     unsigned nfeat_out;
diff --git a/src/backend/cuda/transform.cu b/src/backend/cuda/transform.cu
index 853617c0a4..07c312353c 100644
--- a/src/backend/cuda/transform.cu
+++ b/src/backend/cuda/transform.cu
@@ -16,7 +16,7 @@ namespace cuda
 {
     template<typename T>
     Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
-                        const af_interp_type method, const bool inverse)
+                        const af_interp_type method, const bool inverse, const bool perspective)
     {
         const af::dim4 idims = in.dims();
 
@@ -24,13 +24,13 @@ namespace cuda
 
         switch(method) {
             case AF_INTERP_NEAREST:
-                kernel::transform<T, AF_INTERP_NEAREST> (out, in, transform, inverse);
+                kernel::transform<T, AF_INTERP_NEAREST> (out, in, transform, inverse, perspective);
                 break;
             case AF_INTERP_BILINEAR:
-                kernel::transform<T, AF_INTERP_BILINEAR>(out, in, transform, inverse);
+                kernel::transform<T, AF_INTERP_BILINEAR>(out, in, transform, inverse, perspective);
                 break;
             case AF_INTERP_LOWER:
-                kernel::transform<T, AF_INTERP_LOWER>   (out, in, transform, inverse);
+                kernel::transform<T, AF_INTERP_LOWER>   (out, in, transform, inverse, perspective);
                 break;
             default:
                 AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
@@ -43,7 +43,7 @@ namespace cuda
 #define INSTANTIATE(T)                                                                      \
     template Array<T> transform(const Array<T> &in, const Array<float> &transform,          \
                                 const af::dim4 &odims, const af_interp_type method,         \
-                                const bool inverse);
+                                const bool inverse, const bool perspective);
 
     INSTANTIATE(float)
     INSTANTIATE(double)
diff --git a/src/backend/cuda/transform.hpp b/src/backend/cuda/transform.hpp
index eb3d71d097..316953d614 100644
--- a/src/backend/cuda/transform.hpp
+++ b/src/backend/cuda/transform.hpp
@@ -14,5 +14,6 @@ namespace cuda
 {
     template<typename T>
     Array<T> transform(const Array<T> &in, const Array<float> &tf, const af::dim4 &odims,
-                        const af_interp_type method, const bool inverse);
+                       const af_interp_type method, const bool inverse,
+                       const bool perspective);
 }
diff --git a/src/api/c/dispatch.cpp b/src/backend/dispatch.cpp
similarity index 100%
rename from src/api/c/dispatch.cpp
rename to src/backend/dispatch.cpp
diff --git a/src/api/c/dispatch.hpp b/src/backend/dispatch.hpp
similarity index 100%
rename from src/api/c/dispatch.hpp
rename to src/backend/dispatch.hpp
diff --git a/src/backend/host_memory.cpp b/src/backend/host_memory.cpp
new file mode 100644
index 0000000000..9b4f1e5f54
--- /dev/null
+++ b/src/backend/host_memory.cpp
@@ -0,0 +1,113 @@
+/*
+ * Author:  David Robert Nadeau
+ * Site:    http://NadeauSoftware.com/
+ * License: Creative Commons Attribution 3.0 Unported License
+ *          http://creativecommons.org/licenses/by/3.0/deed.en_US
+ * Source:  http://nadeausoftware.com/sites/NadeauSoftware.com/files/getMemorySize.c
+ */
+
+#include "host_memory.hpp"
+
+#if defined(_WIN32)
+#include <Windows.h>
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#if defined(BSD)
+#include <sys/sysctl.h>
+#endif
+
+#else
+#define NOMEMORYSIZE
+#endif
+
+namespace common
+{
+
+#ifdef NOMEMORYSIZE
+size_t getHostMemorySize()
+{
+    return 0L; // Can't detect
+}
+
+#else
+
+/**
+ * Returns the size of physical memory (RAM) in bytes, or 0 if it cannot be determined.
+ */
+size_t getHostMemorySize()
+{
+#if defined(_WIN32) && (defined(__CYGWIN__) || defined(__CYGWIN32__))
+    /* Cygwin under Windows. ------------------------------------ */
+    /* New 64-bit MEMORYSTATUSEX isn't available.  Use old 32-bit */
+    MEMORYSTATUS status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatus( &status );
+    return (size_t)status.dwTotalPhys;
+
+#elif defined(_WIN32)
+    /* Windows. ------------------------------------------------- */
+    /* Use new 64-bit MEMORYSTATUSEX, not old 32-bit MEMORYSTATUS */
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx( &status );
+    return (size_t)status.ullTotalPhys;
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+    /* UNIX variants. ------------------------------------------- */
+    /* Prefer sysctl() over sysconf() except sysctl() HW_REALMEM and HW_PHYSMEM */
+
+#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
+    int mib[2];
+    mib[0] = CTL_HW;
+#if defined(HW_MEMSIZE)
+    mib[1] = HW_MEMSIZE;        /* OSX. --------------------- */
+#elif defined(HW_PHYSMEM64)
+    mib[1] = HW_PHYSMEM64;      /* NetBSD, OpenBSD. --------- */
+#endif
+    int64_t size = 0;       /* 64-bit */
+    size_t len = sizeof( size );
+    if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 )
+        return (size_t)size;
+    return 0L;          /* Failed? */
+
+#elif defined(_SC_AIX_REALMEM)
+    /* AIX. ----------------------------------------------------- */
+    return (size_t)sysconf( _SC_AIX_REALMEM ) * (size_t)1024L;
+
+#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
+    /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */
+    return (size_t)sysconf( _SC_PHYS_PAGES ) *
+        (size_t)sysconf( _SC_PAGESIZE );
+
+#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE)
+    /* Legacy. -------------------------------------------------- */
+    return (size_t)sysconf( _SC_PHYS_PAGES ) *
+        (size_t)sysconf( _SC_PAGE_SIZE );
+
+#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))
+    /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */
+    int mib[2];
+    mib[0] = CTL_HW;
+#if defined(HW_REALMEM)
+    mib[1] = HW_REALMEM;        /* FreeBSD. ----------------- */
+#elif defined(HW_PHYSMEM)
+    mib[1] = HW_PHYSMEM;        /* Others. ------------------ */
+#endif
+    unsigned int size = 0;      /* 32-bit */
+    size_t len = sizeof( size );
+    if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 )
+        return (size_t)size;
+    return 0L;          /* Failed? */
+#endif /* sysctl and sysconf variants */
+
+#else
+    return 0L;          /* Unknown OS. */
+#endif
+}
+
+#endif // NOMEMORYSIZE
+} // namespace common
diff --git a/src/backend/host_memory.hpp b/src/backend/host_memory.hpp
new file mode 100644
index 0000000000..5955cbfbd9
--- /dev/null
+++ b/src/backend/host_memory.hpp
@@ -0,0 +1,18 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <cstddef>
+
+namespace common
+{
+
+size_t getHostMemorySize();
+
+}
diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp
index 0860098c9f..002c1d5b82 100644
--- a/src/backend/opencl/Array.cpp
+++ b/src/backend/opencl/Array.cpp
@@ -16,41 +16,42 @@
 #include <memory.hpp>
 #include <platform.hpp>
 #include <cstddef>
+#include <af/opencl.h>
+#include <util.hpp>
+#include <MemoryManager.hpp>
 
 using af::dim4;
 
 namespace opencl
 {
-
-    const int MAX_JIT_LEN = 20;
     using JIT::BufferNode;
     using JIT::Node;
     using JIT::Node_ptr;
 
     template<typename T>
     Array<T>::Array(af::dim4 dims) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(bufferAlloc(info.elements() * sizeof(T)), bufferFree),
         data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {
     }
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, JIT::Node_ptr n) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(),
         data_dims(dims),
-        node(n), offset(0), ready(false), owner(true)
+        node(n), ready(false), owner(true)
     {
     }
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, const T * const in_data) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(bufferAlloc(info.elements()*sizeof(T)), bufferFree),
         data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {
         static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
         static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
@@ -59,10 +60,10 @@ namespace opencl
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(copy ? bufferAlloc(info.elements() * sizeof(T)) : new cl::Buffer(mem), bufferFree),
         data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
     {
         if (copy) {
             clRetainMemObject(mem);
@@ -74,12 +75,11 @@ namespace opencl
     }
 
     template<typename T>
-    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &stride) :
-        info(parent.getDevId(), dims, offsets, stride, (af_dtype)dtype_traits<T>::af_type),
+    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim_t &offset_, const dim4 &stride) :
+        info(parent.getDevId(), dims, offset_, stride, (af_dtype)dtype_traits<T>::af_type),
         data(parent.getData()),
         data_dims(parent.getDataDims()),
         node(),
-        offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
         ready(true),
         owner(false)
     { }
@@ -87,15 +87,33 @@ namespace opencl
 
     template<typename T>
     Array<T>::Array(Param &tmp) :
-        info(getActiveDeviceId(), af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]),
-                  af::dim4(0, 0, 0, 0),
-                  af::dim4(tmp.info.strides[0], tmp.info.strides[1],
-                           tmp.info.strides[2], tmp.info.strides[3]),
-                  (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(),
+             af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]),
+             0,
+             af::dim4(tmp.info.strides[0], tmp.info.strides[1],
+                      tmp.info.strides[2], tmp.info.strides[3]),
+             (af_dtype)dtype_traits<T>::af_type),
         data(tmp.data, bufferFree),
         data_dims(af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3])),
-        node(), offset(0), ready(true), owner(true)
+        node(), ready(true), owner(true)
+    {
+    }
+
+    template<typename T>
+    Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset_,
+                    const T * const in_data, bool is_device) :
+        info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type),
+        data(is_device ?
+             (new cl::Buffer((cl_mem)in_data)) :
+             (bufferAlloc(info.total() * sizeof(T))), bufferFree),
+        data_dims(dims),
+        node(),
+        ready(true),
+        owner(true)
     {
+        if (!is_device) {
+            getQueue().enqueueWriteBuffer(*data.get(), CL_TRUE, 0, sizeof(T) * info.total(), in_data);
+        }
     }
 
 
@@ -166,9 +184,9 @@ namespace opencl
         n->getInfo(length, buf_count, bytes);
         n->resetFlags();
 
-        if (length > MAX_JIT_LEN ||
-            buf_count >= MAX_BUFFERS ||
-            bytes >= MAX_BYTES) {
+        if (length > getMaxJitSize() ||
+            buf_count >= getMaxBuffers() ||
+            bytes >= getMaxBytes()) {
             out.eval();
         }
 
@@ -185,18 +203,23 @@ namespace opencl
         dim4 dDims = parent.getDataDims();
         dim4 pDims = parent.dims();
 
-        dim4 dims   = toDims  (index, pDims);
-        dim4 offset = toOffset(index, dDims);
-        dim4 stride = toStride (index, dDims);
+        dim4 dims    = toDims  (index, pDims);
+        dim4 strides = toStride (index, dDims);
 
-        Array<T> out = Array<T>(parent, dims, offset, stride);
+        // Find total offsets after indexing
+        dim4 offsets = toOffset(index, pDims);
+        dim4 parent_strides = parent.strides();
+        dim_t offset = parent.getOffset();
+        for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i];
+
+        Array<T> out = Array<T>(parent, dims, offset, strides);
 
         if (!copy) return out;
 
-        if (stride[0] != 1 ||
-            stride[1] <  0 ||
-            stride[2] <  0 ||
-            stride[3] <  0) {
+        if (strides[0] != 1 ||
+            strides[1] <  0 ||
+            strides[2] <  0 ||
+            strides[3] <  0) {
 
             out = copyArray(out);
         }
@@ -258,18 +281,12 @@ namespace opencl
         delete A;
     }
 
-    template<typename T>
-    void evalArray(const Array<T> &A)
-    {
-        A.eval();
-    }
-
     template<typename T>
     void
     writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
     {
         if (!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
+            arr = copyArray<T>(arr);
         }
 
         getQueue().enqueueWriteBuffer(*arr.get(), CL_TRUE,
@@ -285,7 +302,7 @@ namespace opencl
     writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
     {
         if (!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
+            arr = copyArray<T>(arr);
         }
 
         cl::Buffer& buf = *arr.get();
@@ -312,10 +329,13 @@ namespace opencl
                                                        const std::vector<af_seq> &index, \
                                                        bool copy);      \
     template       void      destroyArray<T>          (Array<T> *A);    \
-    template       void      evalArray<T>             (const Array<T> &A); \
     template       Array<T>  createNodeArray<T>       (const dim4 &size, JIT::Node_ptr node); \
+    template       Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \
+                                   const T * const in_data,             \
+                                   bool is_device);                     \
     template       Array<T>::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy); \
     template       Array<T>::~Array        ();                          \
+    template       Node_ptr Array<T>::getNode() const;             \
     template       void Array<T>::eval();                               \
     template       void Array<T>::eval() const;                         \
     template       void      writeHostDataArray<T>    (Array<T> &arr, const T * const data, const size_t bytes); \
diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp
index 1db0ab6347..8c5bda90de 100644
--- a/src/backend/opencl/Array.hpp
+++ b/src/backend/opencl/Array.hpp
@@ -21,6 +21,7 @@
 #include <memory.hpp>
 #include <memory>
 #include <err_common.hpp>
+#include <err_opencl.hpp>
 
 namespace opencl
 {
@@ -69,9 +70,6 @@ namespace opencl
                             const std::vector<af_seq> &index,
                             bool copy=true);
 
-    template<typename T>
-    void evalArray(const Array<T> &A);
-
     // Creates a new Array object on the heap and returns a reference to it.
     template<typename T>
     void destroyArray(Array<T> *A);
@@ -80,10 +78,16 @@ namespace opencl
     void *getDevicePtr(const Array<T>& arr)
     {
         cl::Buffer *buf = arr.device();
-        memPop((T *)buf);
+        memLock((T *)buf);
         return (void *)((*buf)());
     }
 
+    template<typename T>
+    void *getRawPtr(const Array<T>& arr)
+    {
+        return (void *)(arr.get());
+    }
+
     template<typename T>
     class Array
     {
@@ -92,12 +96,12 @@ namespace opencl
         af::dim4 data_dims;
 
         JIT::Node_ptr node;
-        dim_t offset;
         bool ready;
         bool owner;
 
         Array(af::dim4 dims);
-        Array(const Array<T>& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride);
+
+        Array(const Array<T>& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride);
         Array(Param &tmp);
         explicit Array(af::dim4 dims, JIT::Node_ptr n);
         explicit Array(af::dim4 dims, const T * const in_data);
@@ -105,6 +109,9 @@ namespace opencl
 
     public:
 
+        Array(af::dim4 dims, af::dim4 strides, dim_t offset,
+              const T * const in_data, bool is_device = false);
+
         void resetInfo(const af::dim4& dims)        { info.resetInfo(dims);         }
         void resetDims(const af::dim4& dims)        { info.resetDims(dims);         }
         void modDims(const af::dim4 &newDims)       { info.modDims(newDims);        }
@@ -115,7 +122,6 @@ namespace opencl
     RET_TYPE NAME() const { return info.NAME(); }
 
         INFO_FUNC(const af_dtype& ,getType)
-        INFO_FUNC(const af::dim4& ,offsets)
         INFO_FUNC(const af::dim4& ,strides)
         INFO_FUNC(size_t          ,elements)
         INFO_FUNC(size_t          ,ndims)
@@ -185,7 +191,7 @@ namespace opencl
 
         const dim_t getOffset() const
         {
-            return offset;
+            return info.getOffset();
         }
 
         Buffer_ptr getData() const
@@ -200,6 +206,11 @@ namespace opencl
             return isOwner() ? dims() : data_dims;
         }
 
+        void setDataDims(const dim4 &new_dims)
+        {
+            data_dims = new_dims;
+        }
+
         operator Param() const
         {
             KParam info = {{dims()[0], dims()[1], dims()[2], dims()[3]},
@@ -212,6 +223,35 @@ namespace opencl
 
         JIT::Node_ptr getNode() const;
 
+    public:
+        std::shared_ptr<T> getMappedPtr() const
+        {
+            // Map the buffer from this array's first element to its end; the deleter unmaps.
+            auto func = [=] (void* ptr) {
+                try {
+                    if(ptr != nullptr) {
+                        getQueue().enqueueUnmapMemObject(*data, ptr);
+                    }
+                } catch(const cl::Error &err) {
+                    CL_TO_AF_ERROR(err);
+                }
+            };
+
+            T *ptr = nullptr;
+            try {
+                // The OpenCL map offset and size are in bytes; getOffset() is in elements.
+                ptr = (T*)getQueue().enqueueMapBuffer(*const_cast<cl::Buffer*>(get()),
+                                                      true, CL_MAP_READ|CL_MAP_WRITE,
+                                                      getOffset() * sizeof(T),
+                                                      (getDataDims().elements() - getOffset())
+                                                      * sizeof(T));
+            } catch(const cl::Error &err) {
+                CL_TO_AF_ERROR(err);
+            }
+
+            return std::shared_ptr<T>(ptr, func);
+        }
+
         friend Array<T> createValueArray<T>(const af::dim4 &size, const T& value);
         friend Array<T> createHostDataArray<T>(const af::dim4 &size, const T * const data);
         friend Array<T> createDeviceDataArray<T>(const af::dim4 &size, const void *data);
@@ -226,8 +266,8 @@ namespace opencl
                                           bool copy);
 
         friend void destroyArray<T>(Array<T> *arr);
-        friend void evalArray<T>(const Array<T> &arr);
         friend void *getDevicePtr<T>(const Array<T>& arr);
+        friend void *getRawPtr<T>(const Array<T>& arr);
     };
 
 }
diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt
index 86ba1b2aad..bbe430df15 100644
--- a/src/backend/opencl/CMakeLists.txt
+++ b/src/backend/opencl/CMakeLists.txt
@@ -10,13 +10,26 @@ IF(USE_OPENCL_F77_BLAS)
     ADD_DEFINITIONS(-DUSE_F77_BLAS)
 ENDIF()
 
-IF(USE_OPENCL_MKL)
-    MESSAGE("Using MKL")
+IF(USE_OPENCL_MKL) # Manual MKL Setup
+    MESSAGE("OpenCL Backend Using MKL")
     ADD_DEFINITIONS(-DUSE_MKL)
+ELSE(USE_OPENCL_MKL)
+    IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS
+        MESSAGE("OpenCL Backend Using MKL RT")
+        ADD_DEFINITIONS(-DUSE_MKL)
+    ENDIF()
 ENDIF()
 
 IF(APPLE)
-    FIND_PACKAGE(LAPACK)
+    FIND_PACKAGE(LAPACKE QUIET) # For finding MKL
+    IF(NOT LAPACK_FOUND)
+        # UNSET THE VARIABLES FROM LAPACKE
+        UNSET(LAPACKE_LIB CACHE)
+        UNSET(LAPACK_LIB CACHE)
+        UNSET(LAPACKE_INCLUDES CACHE)
+        UNSET(LAPACKE_ROOT_DIR CACHE)
+        FIND_PACKAGE(LAPACK)
+    ENDIF()
 ELSE(APPLE) # Linux and Windows
     FIND_PACKAGE(LAPACKE)
 ENDIF(APPLE)
@@ -123,6 +136,12 @@ FILE(GLOB conv_ker_headers
 FILE(GLOB conv_ker_sources
      "kernel/convolve/*.cpp")
 
+FILE(GLOB cpu_headers
+     "cpu/*.hpp")
+
+FILE(GLOB cpu_sources
+     "cpu/*.cpp")
+
 source_group(backend\\opencl\\Headers FILES ${opencl_headers})
 source_group(backend\\opencl\\Sources FILES ${opencl_sources})
 source_group(backend\\opencl\\JIT FILES ${jit_sources})
@@ -131,6 +150,8 @@ source_group(backend\\opencl\\kernel\\cl FILES ${opencl_kernels})
 source_group(backend\\opencl\\kernel\\Sources FILES ${kernel_sources})
 source_group(backend\\opencl\\kernel\\convolve\\Headers FILES ${conv_ker_headers})
 source_group(backend\\opencl\\kernel\\convolve\\Sources FILES ${conv_ker_sources})
+source_group(backend\\opencl\\cpu\\Headers FILES ${cpu_headers})
+source_group(backend\\opencl\\cpu\\Sources FILES ${cpu_sources})
 
 IF(LAPACK_FOUND)
     FILE(GLOB magma_sources
@@ -189,10 +210,6 @@ CL_KERNEL_TO_H(
 # OS Definitions
 IF(UNIX)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment")
-ELSE(${UNIX}) #Windows
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
-    SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj")
 ENDIF()
 
 IF(DEFINED BLAS_SYM_FILE)
@@ -206,6 +223,8 @@ IF(DEFINED BLAS_SYM_FILE)
                 ${kernel_sources}
                 ${conv_ker_headers}
                 ${conv_ker_sources}
+                ${cpu_headers}
+                ${cpu_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${magma_sources}
@@ -244,6 +263,8 @@ ELSE(DEFINED BLAS_SYM_FILE)
                 ${kernel_sources}
                 ${conv_ker_headers}
                 ${conv_ker_sources}
+                ${cpu_headers}
+                ${cpu_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${c_headers}
diff --git a/src/backend/opencl/JIT/BinaryNode.hpp b/src/backend/opencl/JIT/BinaryNode.hpp
index f087760b87..b1f6d112b7 100644
--- a/src/backend/opencl/JIT/BinaryNode.hpp
+++ b/src/backend/opencl/JIT/BinaryNode.hpp
@@ -51,6 +51,9 @@ namespace JIT
 
         int setArgs(cl::Kernel &ker, int id)
         {
+            if (m_set_arg) return id;
+            m_set_arg = true;
+
             id = m_lhs->setArgs(ker, id);
             id = m_rhs->setArgs(ker, id);
             return id;
@@ -120,10 +123,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
+            resetCommonFlags();
             m_lhs->resetFlags();
             m_rhs->resetFlags();
         }
diff --git a/src/backend/opencl/JIT/BufferNode.hpp b/src/backend/opencl/JIT/BufferNode.hpp
index 71723b99df..9306d59ef5 100644
--- a/src/backend/opencl/JIT/BufferNode.hpp
+++ b/src/backend/opencl/JIT/BufferNode.hpp
@@ -24,7 +24,6 @@ namespace JIT
         const std::shared_ptr<cl::Buffer> m_data;
         const Param m_param;
         const unsigned m_bytes;
-        bool m_set_arg;
         bool m_linear;
 
     public:
@@ -39,7 +38,6 @@ namespace JIT
               m_data(data),
               m_param(param),
               m_bytes(bytes),
-              m_set_arg(false),
               m_linear(is_linear)
         {}
 
@@ -140,12 +138,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_gen_name = false;
-            m_set_arg = false;
+            resetCommonFlags();
         }
     };
 
diff --git a/src/backend/opencl/JIT/Node.hpp b/src/backend/opencl/JIT/Node.hpp
index fedf7fb9bd..fc34c09c19 100644
--- a/src/backend/opencl/JIT/Node.hpp
+++ b/src/backend/opencl/JIT/Node.hpp
@@ -32,8 +32,20 @@ namespace JIT
         bool m_gen_func;
         bool m_gen_param;
         bool m_gen_offset;
+        bool m_set_arg;
         bool m_gen_name;
 
+    protected:
+        void resetCommonFlags()
+        {
+            m_set_id = false;
+            m_gen_func = false;
+            m_gen_param = false;
+            m_gen_offset = false;
+            m_set_arg = false;
+            m_gen_name = false;
+        }
+
     public:
 
         Node(const char *type_str, const char *name_str)
@@ -44,6 +56,7 @@ namespace JIT
               m_gen_func(false),
               m_gen_param(false),
               m_gen_offset(false),
+              m_set_arg(false),
               m_gen_name(false)
         {}
 
@@ -64,7 +77,10 @@ namespace JIT
         }
 
 
-        virtual void resetFlags() {}
+        virtual void resetFlags()
+        {
+            resetCommonFlags();
+        }
 
         virtual bool isLinear(dim_t dims[4]) { return true; }
 
diff --git a/src/backend/opencl/JIT/ScalarNode.hpp b/src/backend/opencl/JIT/ScalarNode.hpp
index 9eaa544134..0bba7a2fc9 100644
--- a/src/backend/opencl/JIT/ScalarNode.hpp
+++ b/src/backend/opencl/JIT/ScalarNode.hpp
@@ -24,14 +24,12 @@ namespace JIT
     {
     private:
         const T m_val;
-        bool m_set_arg;
 
     public:
 
         ScalarNode(T val)
             : Node(dtype_traits<T>::getName(), shortname<T>(false)),
-              m_val(val),
-              m_set_arg(false)
+              m_val(val)
         {
         }
 
@@ -101,12 +99,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
-            m_gen_name = false;
-            m_set_arg = false;
+            resetCommonFlags();
         }
     };
 
diff --git a/src/backend/opencl/JIT/UnaryNode.hpp b/src/backend/opencl/JIT/UnaryNode.hpp
index 78fda23e92..e1f32ded8f 100644
--- a/src/backend/opencl/JIT/UnaryNode.hpp
+++ b/src/backend/opencl/JIT/UnaryNode.hpp
@@ -49,6 +49,8 @@ namespace JIT
 
         int setArgs(cl::Kernel &ker, int id)
         {
+            if (m_set_arg) return id;
+            m_set_arg = true;
             return m_child->setArgs(ker, id);
         }
 
@@ -108,10 +110,7 @@ namespace JIT
 
         void resetFlags()
         {
-            m_set_id = false;
-            m_gen_func = false;
-            m_gen_param = false;
-            m_gen_offset = false;
+            resetCommonFlags();
             m_child->resetFlags();
         }
     };
diff --git a/src/backend/opencl/binary.hpp b/src/backend/opencl/binary.hpp
index 4f58cb49e6..11493a5966 100644
--- a/src/backend/opencl/binary.hpp
+++ b/src/backend/opencl/binary.hpp
@@ -22,7 +22,7 @@ namespace opencl
     {
         const char *name()
         {
-            return "noop";
+            return "__invalid";
         }
     };
 
diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp
index 6173a684ea..77531154e5 100644
--- a/src/backend/opencl/blas.cpp
+++ b/src/backend/opencl/blas.cpp
@@ -19,6 +19,13 @@
 #include <err_clblas.hpp>
 #include <math.hpp>
 #include <transpose.hpp>
+#include <arith.hpp>
+#include <reduce.hpp>
+#include <complex.hpp>
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_blas.hpp>
+#endif
 
 namespace opencl
 {
@@ -113,6 +120,12 @@ template<typename T>
 Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                 af_mat_prop optLhs, af_mat_prop optRhs)
 {
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+    if(OpenCLCPUOffload(false)) {   // Do not force offload gemm on OSX Intel devices
+        return cpu::matmul(lhs, rhs, optLhs, optRhs);
+    }
+#endif
+
     initBlas();
     clblasTranspose lOpts = toClblasTranspose(optLhs);
     clblasTranspose rOpts = toClblasTranspose(optRhs);
@@ -168,45 +181,15 @@ Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
     return out;
 }
 
-template<typename T, bool conjugate, bool both_conjugate>
-Array<T> dot_(const Array<T> &lhs, const Array<T> &rhs,
-              af_mat_prop optLhs, af_mat_prop optRhs)
-{
-    initBlas();
-
-    int N = lhs.dims()[0];
-    dot_func<T, conjugate> dot;
-    cl::Event event;
-    Array<T> out = createEmptyArray<T>(af::dim4(1));
-    cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N);
-    CLBLAS_CHECK(
-        dot(N,
-            (*out.get())(), out.getOffset(),
-            (*lhs.get())(),  lhs.getOffset(), lhs.strides()[0],
-            (*rhs.get())(),  rhs.getOffset(), rhs.strides()[0],
-            scratch(),
-            1, &getQueue()(), 0, nullptr, &event())
-        );
-
-    if(both_conjugate)
-        transpose_inplace<T>(out, true);
-
-    return out;
-}
-
 template<typename T>
 Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
              af_mat_prop optLhs, af_mat_prop optRhs)
 {
-    if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) {
-        return dot_<T, false, true>(lhs, rhs, optLhs, optRhs);
-    } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) {
-        return dot_<T, true, false>(lhs, rhs, optLhs, optRhs);
-    } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) {
-        return dot_<T, true, false>(rhs, lhs, optRhs, optLhs);
-    } else {
-        return dot_<T, false, false>(lhs, rhs, optLhs, optRhs);
-    }
+    const Array<T> lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj<T>(lhs));
+    const Array<T> rhs_ = (optRhs == AF_MAT_NONE ? rhs : conj<T>(rhs));
+
+    const Array<T> temp = arithOp<T, af_mul_t>(lhs_, rhs_, lhs_.dims());
+    return reduce<af_add_t, T, T>(temp, 0, false, 0);
 }
 
 #define INSTANTIATE_BLAS(TYPE)                                                          \
diff --git a/src/backend/opencl/cholesky.cpp b/src/backend/opencl/cholesky.cpp
index 78fe999645..a2034a331a 100644
--- a/src/backend/opencl/cholesky.cpp
+++ b/src/backend/opencl/cholesky.cpp
@@ -8,14 +8,16 @@
  ********************************************************/
 
 #include <cholesky.hpp>
-#include <copy.hpp>
 #include <err_common.hpp>
-#include <blas.hpp>
 #include <err_opencl.hpp>
+#include <blas.hpp>
+#include <copy.hpp>
 
 #if defined(WITH_OPENCL_LINEAR_ALGEBRA)
 #include <magma/magma.h>
 #include <triangle.hpp>
+#include <platform.hpp>
+#include <cpu/cpu_cholesky.hpp>
 
 namespace opencl
 {
@@ -24,6 +26,10 @@ template<typename T>
 int cholesky_inplace(Array<T> &in, const bool is_upper)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::cholesky_inplace(in, is_upper);
+        }
+
         initBlas();
 
         dim4 iDims = in.dims();
@@ -46,6 +52,9 @@ template<typename T>
 Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::cholesky(info, in, is_upper);
+        }
 
         Array<T> out = copyArray<T>(in);
         *info = cholesky_inplace(out, is_upper);
diff --git a/src/backend/opencl/copy.cpp b/src/backend/opencl/copy.cpp
index 39cbf4b59d..e1716f1632 100644
--- a/src/backend/opencl/copy.cpp
+++ b/src/backend/opencl/copy.cpp
@@ -29,7 +29,7 @@ namespace opencl
         cl::Buffer buf;
         Array<T> out = A;
 
-        if (A.isOwner() || // No offsets, No strides
+        if (A.isLinear() || // No offsets, No strides
             A.ndims() == 1 // Simple offset, no strides.
             ) {
             buf = *A.get();
diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp
new file mode 100644
index 0000000000..724c6bb1e9
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_blas.cpp
@@ -0,0 +1,210 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_blas.hpp>
+#include <math.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+using std::add_const;
+using std::add_pointer;
+using std::enable_if;
+using std::is_floating_point;
+using std::remove_const;
+using std::conditional;
+
+// Some implementations of BLAS require void* for complex pointers while others use float*/double*
+//
+// Sample cgemm API
+// OpenBLAS
+// void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
+//                  OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+//                  OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda,
+//                  OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta,
+//                  float *C, OPENBLAS_CONST blasint ldc);
+//
+// MKL
+// void cblas_cgemm(const  CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, const  CBLAS_TRANSPOSE TransB,
+//                  const MKL_INT M, const MKL_INT N, const MKL_INT K,
+//                  const void *alpha, const void *A, const MKL_INT lda,
+//                  const void *B, const MKL_INT ldb, const void *beta,
+//                  void *C, const MKL_INT ldc);
+// atlas cblas
+// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+//                  const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+//                  const void *alpha, const void *A, const int lda,
+//                  const void *B, const int ldb, const void *beta,
+//                  void *C, const int ldc);
+//
+// LAPACKE
+// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+//                  const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+//                  const void *alpha, const void *A, const int lda,
+//                  const void *B, const int ldb, const void *beta,
+//                  void *C, const int ldc);
+#if defined(IS_OPENBLAS)
+    static const bool cplx_void_ptr = false;
+#else
+    static const bool cplx_void_ptr = true;
+#endif
+
+template<typename T, class Enable = void>
+struct blas_base {
+    using type = typename dtype_traits<T>::base_type;
+};
+
+template<typename T>
+struct blas_base <T, typename enable_if<is_complex<T>::value && cplx_void_ptr>::type> {
+    using type = void;
+};
+
+
+template<typename T>
+using cptr_type     =   typename conditional<   is_complex<T>::value,
+                                                const typename blas_base<T>::type *,
+                                                const T*>::type;
+template<typename T>
+using ptr_type     =    typename conditional<   is_complex<T>::value,
+                                                typename blas_base<T>::type *,
+                                                T*>::type;
+template<typename T>
+using scale_type   =    typename conditional<   is_complex<T>::value,
+                                                const typename blas_base<T>::type *,
+                                                const T>::type;
+
+template<typename T>
+using gemm_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE,
+                                const blasint, const blasint, const blasint,
+                                scale_type<T>, cptr_type<T>, const blasint,
+                                cptr_type<T>, const blasint,
+                                scale_type<T>, ptr_type<T>, const blasint);
+
+template<typename T>
+using gemv_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE,
+                                const blasint, const blasint,
+                                scale_type<T>, cptr_type<T>, const blasint,
+                                cptr_type<T>, const blasint,
+                                scale_type<T>, ptr_type<T>, const blasint);
+
+#define BLAS_FUNC_DEF( FUNC )                           \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+#define BLAS_FUNC( FUNC, TYPE, PREFIX )                 \
+  template<> FUNC##_func_def<TYPE> FUNC##_func<TYPE>()  \
+{ return &cblas_##PREFIX##FUNC; }
+
+BLAS_FUNC_DEF( gemm )
+BLAS_FUNC(gemm , float   , s)
+BLAS_FUNC(gemm , double  , d)
+BLAS_FUNC(gemm , cfloat  , c)
+BLAS_FUNC(gemm , cdouble , z)
+
+BLAS_FUNC_DEF(gemv)
+BLAS_FUNC(gemv , float   , s)
+BLAS_FUNC(gemv , double  , d)
+BLAS_FUNC(gemv , cfloat  , c)
+BLAS_FUNC(gemv , cdouble , z)
+
+template<typename T, int value>
+typename enable_if<is_floating_point<T>::value, scale_type<T>>::type
+getScale() { return T(value); }
+
+template<typename T, int value>
+typename enable_if<is_complex<T>::value, scale_type<T>>::type
+getScale()
+{
+    static T val = scalar<T>(value);
+    return (const typename blas_base<T>::type *)&val;
+}
+
+CBLAS_TRANSPOSE
+toCblasTranspose(af_mat_prop opt)
+{
+    CBLAS_TRANSPOSE out = CblasNoTrans;
+    switch(opt) {
+        case AF_MAT_NONE        : out = CblasNoTrans;   break;
+        case AF_MAT_TRANS       : out = CblasTrans;     break;
+        case AF_MAT_CTRANS      : out = CblasConjTrans; break;
+        default                 : AF_ERROR("INVALID af_mat_prop", AF_ERR_ARG);
+    }
+    return out;
+}
+
+template<typename T>
+Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
+                af_mat_prop optLhs, af_mat_prop optRhs)
+{
+    CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs);
+    CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs);
+
+    int aRowDim = (lOpts == CblasNoTrans) ? 0 : 1;
+    int aColDim = (lOpts == CblasNoTrans) ? 1 : 0;
+    int bColDim = (rOpts == CblasNoTrans) ? 1 : 0;
+
+    dim4 lDims = lhs.dims();
+    dim4 rDims = rhs.dims();
+    int M = lDims[aRowDim];
+    int N = rDims[bColDim];
+    int K = lDims[aColDim];
+
+    //FIXME: Leaks on errors.
+    Array<T> out = createValueArray<T>(af::dim4(M, N, 1, 1), scalar<T>(0));
+    auto alpha = getScale<T, 1>();
+    auto beta  = getScale<T, 0>();
+
+    dim4 lStrides = lhs.strides();
+    dim4 rStrides = rhs.strides();
+    using BT  =       typename blas_base<T>::type;
+
+    // get host pointers from mapped memory
+    auto lPtr = lhs.getMappedPtr();
+    auto rPtr = rhs.getMappedPtr();
+    auto oPtr = out.getMappedPtr();
+
+    if(rDims[bColDim] == 1) {
+        N = lDims[aColDim];
+        gemv_func<T>()(
+            CblasColMajor, lOpts,
+            lDims[0], lDims[1],
+            alpha,
+            (BT*)lPtr.get(), lStrides[1],
+            (BT*)rPtr.get(), rStrides[0],
+            beta,
+            (BT*)oPtr.get(), 1);
+    } else {
+        gemm_func<T>()(
+            CblasColMajor, lOpts, rOpts,
+            M, N, K,
+            alpha,
+            (BT*)lPtr.get(), lStrides[1],
+            (BT*)rPtr.get(), rStrides[1],
+            beta,
+            (BT*)oPtr.get(), out.dims()[0]);
+    }
+
+    return out;
+}
+
+#define INSTANTIATE_BLAS(TYPE)                                                          \
+    template Array<TYPE> matmul<TYPE>(const Array<TYPE> &lhs, const Array<TYPE> &rhs,   \
+                                      af_mat_prop optLhs, af_mat_prop optRhs);
+
+INSTANTIATE_BLAS(float)
+INSTANTIATE_BLAS(cfloat)
+INSTANTIATE_BLAS(double)
+INSTANTIATE_BLAS(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp
new file mode 100644
index 0000000000..908742471d
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_blas.hpp
@@ -0,0 +1,20 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    template<typename T>
+    Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
+                    af_mat_prop optLhs, af_mat_prop optRhs);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp
new file mode 100644
index 0000000000..9acbcc4fad
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_cholesky.cpp
@@ -0,0 +1,84 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_cholesky.hpp>
+#include <cpu/cpu_triangle.hpp>
+#include <copy.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T>
+using potrf_func_def = int (*)(ORDER_TYPE, char,
+                               int,
+                               T*, int);
+
+#define CH_FUNC_DEF( FUNC )                                     \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define CH_FUNC( FUNC, TYPE, PREFIX )                           \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()        \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+CH_FUNC_DEF( potrf )
+CH_FUNC(potrf , float  , s)
+CH_FUNC(potrf , double , d)
+CH_FUNC(potrf , cfloat , c)
+CH_FUNC(potrf , cdouble, z)
+
+template<typename T>
+Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
+{
+    Array<T> out = copyArray<T>(in);
+    *info = cholesky_inplace(out, is_upper);
+
+    std::shared_ptr<T> oPtr = out.getMappedPtr();
+
+    if (is_upper) triangle<T, true , false>(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides());
+    else          triangle<T, false, false>(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides());
+
+    return out;
+}
+
+template<typename T>
+int cholesky_inplace(Array<T> &in, const bool is_upper)
+{
+    dim4 iDims = in.dims();
+    int N = iDims[0];
+
+    char uplo = 'L';
+    if(is_upper)
+        uplo = 'U';
+
+    std::shared_ptr<T> inPtr = in.getMappedPtr();
+
+    int info = potrf_func<T>()(AF_LAPACK_COL_MAJOR, uplo,
+                               N, inPtr.get(), in.strides()[1]);
+
+    return info;
+}
+
+#define INSTANTIATE_CH(T)                                                                   \
+    template int cholesky_inplace<T>(Array<T> &in, const bool is_upper);                    \
+    template Array<T> cholesky<T>   (int *info, const Array<T> &in, const bool is_upper);   \
+
+
+INSTANTIATE_CH(float)
+INSTANTIATE_CH(cfloat)
+INSTANTIATE_CH(double)
+INSTANTIATE_CH(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_cholesky.hpp b/src/backend/opencl/cpu/cpu_cholesky.hpp
new file mode 100644
index 0000000000..041e93980e
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_cholesky.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // Returns the Cholesky factor of a copy of `in`; *info receives the LAPACK status code.
+    template<typename T> Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper);
+
+    // Factorizes `in` in place (potrf) and returns the LAPACK status code.
+    template<typename T> int cholesky_inplace(Array<T> &in, const bool is_upper);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp
new file mode 100644
index 0000000000..f7f690322c
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_helper.hpp
@@ -0,0 +1,74 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef AF_OPENCL_CPU
+#define AF_OPENCL_CPU
+
+#include <af/defines.h>
+#include <Array.hpp>
+#include <memory.hpp>
+#include <types.hpp>
+#include <err_common.hpp>
+#include <platform.hpp>
+
+//********************************************************/
+// LAPACK
+//********************************************************/
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+// NOTE(review): defined ahead of the LAPACKE includes below, presumably so those headers adopt these complex types - confirm.
+#define lapack_complex_float opencl::cfloat
+#define lapack_complex_double opencl::cdouble
+#define LAPACK_PREFIX LAPACKE_
+#define ORDER_TYPE int                       // type of the leading layout argument in the *_func_def signatures
+#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
+#define LAPACK_NAME(fn) LAPACKE_##fn         // e.g. LAPACK_NAME(spotrf) -> LAPACKE_spotrf
+
+#ifdef USE_MKL
+    #include<mkl_lapacke.h>
+#else
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+        #include <lapacke.hpp>
+        #undef AF_LAPACK_COL_MAJOR
+        #define AF_LAPACK_COL_MAJOR 0        // NOTE(review): placeholder value on Apple; presumably unused by lapacke.hpp - confirm
+    #else // NETLIB LAPACKE
+        #include<lapacke.h>
+    #endif // __APPLE__
+#endif // USE_MKL
+
+#endif // WITH_OPENCL_LINEAR_ALGEBRA
+
+//********************************************************/
+// BLAS
+//********************************************************/
+#ifdef USE_MKL
+    #include <mkl_cblas.h>
+#else
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+    #else
+        extern "C" {
+            #include <cblas.h>
+        }
+    #endif
+#endif
+
+// TODO: Ask upstream for a more official way to detect it
+#ifdef OPENBLAS_CONST
+#define IS_OPENBLAS
+#endif
+
+// Make sure we get the correct type signature for OpenBLAS.
+// OpenBLAS defines blasint as its index type. Emulate this
+// if we're not dealing with OpenBLAS and use it where applicable.
+#ifndef IS_OPENBLAS
+typedef int blasint;
+#endif
+
+#endif
diff --git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp
new file mode 100644
index 0000000000..4f73a80707
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_inverse.cpp
@@ -0,0 +1,76 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_inverse.hpp>
+#include <cpu/cpu_lu.hpp>
+#include <copy.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T>
+using getri_func_def = int (*)(ORDER_TYPE, int,     // layout, matrix order
+                               T *, int,            // matrix data, its column stride
+                               const int *);        // pivot array
+
+#define INV_FUNC_DEF( FUNC )                                        \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+#define INV_FUNC( FUNC, TYPE, PREFIX )                              \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+INV_FUNC_DEF( getri )          // declares getri_func<T>(); defined only through the specializations below
+INV_FUNC(getri , float  , s)   // -> sgetri
+INV_FUNC(getri , double , d)   // -> dgetri
+INV_FUNC(getri , cfloat , c)   // -> cgetri
+INV_FUNC(getri , cdouble, z)   // -> zgetri
+
+template<typename T>
+Array<T> inverse(const Array<T> &in)
+{
+    // Non-square inputs never get here: the caller in opencl/inverse.cpp
+    // only dispatches to this function when dims()[0] == dims()[1] and
+    // otherwise solves against an identity matrix, so a single extent
+    // is enough to describe the matrix.
+    const int order = in.dims()[0];
+
+    // The factorization and the inversion both work on the same buffer,
+    // so they run on a copy that doubles as the return value.
+    Array<T> result = copyArray<T>(in);
+
+    // convert_pivot = false: keep the pivots exactly as getrf wrote them,
+    // because they are passed straight on to getri below.
+    Array<int> pivots = cpu::lu_inplace<T>(result, false);
+
+    std::shared_ptr<T>   dataPtr  = result.getMappedPtr();
+    std::shared_ptr<int> pivotPtr = pivots.getMappedPtr();
+
+    // NOTE: the status code returned by getri is not inspected.
+    getri_func<T>()(AF_LAPACK_COL_MAJOR, order,
+                    dataPtr.get(), result.strides()[1],
+                    pivotPtr.get());
+    return result;
+}
+
+#define INSTANTIATE(T)                                                                   \
+    template Array<T> inverse<T> (const Array<T> &in);
+// Explicit instantiations for the four element types that have a getri specialization.
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_inverse.hpp b/src/backend/opencl/cpu/cpu_inverse.hpp
new file mode 100644
index 0000000000..38581a1906
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_inverse.hpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // Returns the inverse of `in`, computed with LAPACK getrf + getri on a copy of the input.
+    template<typename T> Array<T> inverse(const Array<T> &in);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp
new file mode 100644
index 0000000000..e0234fb7de
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_lu.cpp
@@ -0,0 +1,178 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_lu.hpp>
+#include <math.hpp>
+#include <copy.hpp>
+#include <range.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T>
+using getrf_func_def = int (*)(ORDER_TYPE, int, int,   // layout, rows, columns
+                               T*, int,                // matrix data, its column stride
+                               int*);                  // pivot output
+
+#define LU_FUNC_DEF( FUNC )                                     \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+// Specializes FUNC##_func<TYPE>() to return the address of LAPACK_NAME(<PREFIX><FUNC>).
+#define LU_FUNC( FUNC, TYPE, PREFIX )                           \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()        \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+LU_FUNC_DEF( getrf )          // declares getrf_func<T>(); defined only through the specializations below
+LU_FUNC(getrf , float  , s)   // -> sgetrf
+LU_FUNC(getrf , double , d)   // -> dgetrf
+LU_FUNC(getrf , cfloat , c)   // -> cgetrf
+LU_FUNC(getrf , cdouble, z)   // -> zgetrf
+
+template<typename T>
+void lu_split(Array<T> &lower, Array<T> &upper, const Array<T> &in)
+{
+    // Hold on to the shared_ptrs for as long as the raw pointers are used.
+    std::shared_ptr<T> lowerMap = lower.getMappedPtr();
+    std::shared_ptr<T> upperMap = upper.getMappedPtr();
+    std::shared_ptr<T> inMap    = in.getMappedPtr();
+
+    T *dstL = lowerMap.get();
+    T *dstU = upperMap.get();
+    T *src  = inMap.get();
+
+    const dim4 lDims = lower.dims(),    uDims = upper.dims(),    iDims = in.dims();
+    const dim4 lStr  = lower.strides(), uStr  = upper.strides(), iStr  = in.strides();
+
+    // Walk every element of `in`: (row, col) address the matrix itself,
+    // (slice, batch) the two outer dimensions.
+    for (dim_t batch = 0; batch < iDims[3]; batch++) {
+        const dim_t lOff3 = batch * lStr[3];
+        const dim_t uOff3 = batch * uStr[3];
+        const dim_t iOff3 = batch * iStr[3];
+
+        for (dim_t slice = 0; slice < iDims[2]; slice++) {
+            const dim_t lOff2 = lOff3 + slice * lStr[2];
+            const dim_t uOff2 = uOff3 + slice * uStr[2];
+            const dim_t iOff2 = iOff3 + slice * iStr[2];
+
+            for (dim_t col = 0; col < iDims[1]; col++) {
+                const dim_t lOff1 = lOff2 + col * lStr[1];
+                const dim_t uOff1 = uOff2 + col * uStr[1];
+                const dim_t iOff1 = iOff2 + col * iStr[1];
+
+                // Columns of `in` beyond lower's width have no slot in `lower`.
+                const bool colInLower = col < lDims[1];
+
+                for (dim_t row = 0; row < iDims[0]; row++) {
+                    const dim_t lIdx = lOff1 + row;
+                    const dim_t uIdx = uOff1 + row;
+                    const dim_t iIdx = iOff1 + row;
+
+                    // Rows of `in` beyond upper's height have no slot in `upper`.
+                    const bool rowInUpper = row < uDims[0];
+
+                    // lower: copy below the diagonal, 1 on it, 0 above it.
+                    if (colInLower) {
+                        dstL[lIdx] = (row > col)  ? src[iIdx]
+                                   : (row == col) ? scalar<T>(1.0)
+                                                  : scalar<T>(0);
+                    }
+
+                    // upper: copy on and above the diagonal, 0 below it.
+                    if (rowInUpper) {
+                        dstU[uIdx] = (row > col) ? scalar<T>(0) : src[iIdx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+void convertPivot(Array<int> &pivot, int out_sz)
+{
+    // Starting order for the out_sz entries, produced by range() (runs OpenCL).
+    Array<int> perm = range<int>(dim4(out_sz), 0);
+
+    std::shared_ptr<int> swapsMap = pivot.getMappedPtr();
+    std::shared_ptr<int> permMap  = perm.getMappedPtr();
+    const int *swaps = swapsMap.get();
+    int       *order = permMap.get();
+
+    // Apply the recorded swaps in sequence: entry k trades places with
+    // entry swaps[k] - 1 (the values stored in `pivot` are 1-indexed).
+    const int count = (int)pivot.dims()[0];
+    for (int k = 0; k < count; ++k) {
+        std::swap(order[k], order[swaps[k] - 1]);
+    }
+
+    // Release both mappings before `pivot` is replaced by the new array.
+    swapsMap.reset();
+    permMap.reset();
+    pivot = perm;
+}
+
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+    const int rows  = in.dims()[0];
+    const int cols  = in.dims()[1];
+    const int inner = min(rows, cols);
+
+    // Factorize a copy; afterwards it holds both factors in one matrix.
+    Array<T> packed = copyArray<T>(in);
+    pivot = lu_inplace(packed);
+
+    // Allocate the outputs: lower is rows x inner, upper is inner x cols.
+    lower = createEmptyArray<T>(dim4(rows, inner));
+    upper = createEmptyArray<T>(dim4(inner, cols));
+
+    // Separate the packed result into the two triangular factors.
+    lu_split<T>(lower, upper, packed);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+    const int rows = in.dims()[0];
+    const int cols = in.dims()[1];
+
+    // getrf receives a pivot buffer of min(rows, cols) integers.
+    Array<int> pivot = createEmptyArray<int>(af::dim4(min(rows, cols), 1, 1, 1));
+
+    std::shared_ptr<T>   dataMap  = in.getMappedPtr();
+    std::shared_ptr<int> pivotMap = pivot.getMappedPtr();
+
+    // NOTE: the status code returned by getrf is not inspected.
+    getrf_func<T>()(AF_LAPACK_COL_MAJOR, rows, cols,
+                    dataMap.get(), in.strides()[1],
+                    pivotMap.get());
+
+    // Release both mappings before convertPivot() maps `pivot` again.
+    dataMap.reset();
+    pivotMap.reset();
+    if (convert_pivot) { convertPivot(pivot, rows); }
+    return pivot;
+}
+
+#define INSTANTIATE_LU(T)                                                                           \
+    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
+    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+// Explicit instantiations for the four element types that have a getrf specialization.
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_lu.hpp b/src/backend/opencl/cpu/cpu_lu.hpp
new file mode 100644
index 0000000000..6c038f20c7
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_lu.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // Factorizes a copy of `in` and returns the factors as separate lower / upper matrices plus the pivot array.
+    template<typename T> void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+    // Factorizes `in` in place (getrf) and returns the pivots; convert_pivot post-processes them with convertPivot().
+    template<typename T> Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_qr.cpp b/src/backend/opencl/cpu/cpu_qr.cpp
new file mode 100644
index 0000000000..737a7aec2f
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_qr.cpp
@@ -0,0 +1,118 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_qr.hpp>
+#include <cpu/cpu_triangle.hpp>
+#include <copy.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T>
+using geqrf_func_def = int (*)(ORDER_TYPE, int, int,   // layout, rows, columns
+                               T*, int,                // matrix data, its column stride
+                               T*);                    // output vector (the `t` array of qr_inplace)
+
+template<typename T>
+using gqr_func_def = int (*)(ORDER_TYPE, int, int, int,
+                             T*, int,
+                             const T*);
+
+#define QR_FUNC_DEF( FUNC )                                         \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+// Specializes FUNC##_func<TYPE>() to return the address of LAPACK_NAME(<PREFIX><FUNC>).
+#define QR_FUNC( FUNC, TYPE, PREFIX )                               \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+QR_FUNC_DEF( geqrf )          // declares geqrf_func<T>()
+QR_FUNC(geqrf , float  , s)   // -> sgeqrf
+QR_FUNC(geqrf , double , d)   // -> dgeqrf
+QR_FUNC(geqrf , cfloat , c)   // -> cgeqrf
+QR_FUNC(geqrf , cdouble, z)   // -> zgeqrf
+
+#define GQR_FUNC_DEF( FUNC )                                         \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+// Unlike QR_FUNC, PREFIX is the complete routine name here: real types use orgqr, complex types ungqr.
+#define GQR_FUNC( FUNC, TYPE, PREFIX )                               \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()             \
+{ return & LAPACK_NAME(PREFIX); }
+
+GQR_FUNC_DEF( gqr )           // declares gqr_func<T>()
+GQR_FUNC(gqr , float  , sorgqr)
+GQR_FUNC(gqr , double , dorgqr)
+GQR_FUNC(gqr , cfloat , cungqr)
+GQR_FUNC(gqr , cdouble, zungqr)
+
+template<typename T>
+void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+    // q is allocated as M x max(M, N) (zero padded) so the same buffer can hold the M x M result set at the end.
+    dim4 padDims(M, max(M, N));
+    q = padArray<T, T>(in, padDims, scalar<T>(0));
+    q.resetDims(iDims);           // factorize it with the input's M x N dims
+    t = qr_inplace(q);            // geqrf in place; t receives the scalars it returns
+
+    // SPLIT into q and r
+    dim4 rdims(M, N);
+    r = createEmptyArray<T>(rdims);
+
+    std::shared_ptr<T> qPtr = q.getMappedPtr();
+    std::shared_ptr<T> rPtr = r.getMappedPtr();
+    std::shared_ptr<T> tPtr = t.getMappedPtr();
+    // r = upper triangle (diagonal included) of the factorized matrix; triangle() zeroes the remaining entries.
+    triangle<T, true, false>(rPtr.get(), qPtr.get(), rdims, r.strides(), q.strides());
+    // Overwrite q in place via orgqr/ungqr, called with (M, M, min(M, N)) and the scalars in t.
+    gqr_func<T>()(AF_LAPACK_COL_MAJOR,
+                  M, M, min(M, N),
+                  qPtr.get(), q.strides()[1],
+                  tPtr.get());
+
+    q.resetDims(dim4(M, M));      // expose q with M x M dims
+}
+
+template<typename T>
+Array<T> qr_inplace(Array<T> &in)
+{
+    const int rows = in.dims()[0];
+    const int cols = in.dims()[1];
+
+    // Output vector for geqrf's last argument: min(rows, cols) scalars.
+    Array<T> tau = createEmptyArray<T>(af::dim4(min(rows, cols), 1, 1, 1));
+
+    std::shared_ptr<T> dataMap = in.getMappedPtr();
+    std::shared_ptr<T> tauMap  = tau.getMappedPtr();
+
+    // NOTE: the status code returned by geqrf is not inspected.
+    geqrf_func<T>()(AF_LAPACK_COL_MAJOR, rows, cols,
+                    dataMap.get(), in.strides()[1],
+                    tauMap.get());
+    return tau;
+}
+
+#define INSTANTIATE_QR(T)                                                                           \
+    template Array<T> qr_inplace<T>(Array<T> &in);                                                \
+    template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
+// Explicit instantiations for the four element types that have geqrf / gqr specializations.
+INSTANTIATE_QR(float)
+INSTANTIATE_QR(cfloat)
+INSTANTIATE_QR(double)
+INSTANTIATE_QR(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_qr.hpp b/src/backend/opencl/cpu/cpu_qr.hpp
new file mode 100644
index 0000000000..c499b9d03b
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_qr.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // Full QR: writes Q (M x M) to q, the upper-triangular factor (M x N) to r and geqrf's scalar vector to t.
+    template<typename T> void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
+
+    // Runs geqrf on `in` in place and returns the vector of min(M, N) scalars it produced.
+    template<typename T> Array<T> qr_inplace(Array<T> &in);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp
new file mode 100644
index 0000000000..1bb72f8768
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_solve.cpp
@@ -0,0 +1,176 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_solve.hpp>
+#include <copy.hpp>
+#include <math.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T>
+using gesv_func_def = int (*)(ORDER_TYPE, int, int,   // used by solve() when M == N
+                              T *, int,
+                              int *,
+                              T *, int);
+
+template<typename T>
+using gels_func_def = int (*)(ORDER_TYPE, char,       // used by solve() when M != N
+                              int, int, int,
+                              T *, int,
+                              T *, int);
+
+template<typename T>
+using getrs_func_def = int (*)(ORDER_TYPE, char,      // used by solveLU()
+                               int, int,
+                               const T *, int,
+                               const int *,
+                               T *, int);
+
+template<typename T>
+using trtrs_func_def = int (*)(ORDER_TYPE,            // used by triangleSolve()
+                               char, char, char,
+                               int, int,
+                               const T *, int,
+                               T *, int);
+
+// Declares the accessor FUNC##_func<T>(), which returns a pointer of type FUNC##_func_def<T>.
+#define SOLVE_FUNC_DEF( FUNC )                                      \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+// Specializes FUNC##_func<TYPE>() to return the address of LAPACK_NAME(<PREFIX><FUNC>).
+#define SOLVE_FUNC( FUNC, TYPE, PREFIX )                            \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+SOLVE_FUNC_DEF( gesv )
+SOLVE_FUNC(gesv , float  , s)
+SOLVE_FUNC(gesv , double , d)
+SOLVE_FUNC(gesv , cfloat , c)
+SOLVE_FUNC(gesv , cdouble, z)
+
+SOLVE_FUNC_DEF( gels )
+SOLVE_FUNC(gels , float  , s)
+SOLVE_FUNC(gels , double , d)
+SOLVE_FUNC(gels , cfloat , c)
+SOLVE_FUNC(gels , cdouble, z)
+
+SOLVE_FUNC_DEF( getrs )
+SOLVE_FUNC(getrs , float  , s)
+SOLVE_FUNC(getrs , double , d)
+SOLVE_FUNC(getrs , cfloat , c)
+SOLVE_FUNC(getrs , cdouble, z)
+
+SOLVE_FUNC_DEF( trtrs )
+SOLVE_FUNC(trtrs , float  , s)
+SOLVE_FUNC(trtrs , double , d)
+SOLVE_FUNC(trtrs , cfloat , c)
+SOLVE_FUNC(trtrs , cdouble, z)
+
+template<typename T>
+Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
+                 const Array<T> &b, const af_mat_prop options)
+{
+    // `options` is not used by this implementation.
+    const int order  = A.dims()[0];
+    const int numRhs = b.dims()[1];
+
+    // The right-hand side is copied; the copy is passed to getrs and returned.
+    Array<T> solution = copyArray<T>(b);
+    std::shared_ptr<T  > luMap    = A.getMappedPtr();
+    std::shared_ptr<T  > rhsMap   = solution.getMappedPtr();
+    std::shared_ptr<int> pivotMap = pivot.getMappedPtr();
+
+    getrs_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                    order, numRhs,
+                    luMap.get(), A.strides()[1],
+                    pivotMap.get(),
+                    rhsMap.get(), solution.strides()[1]);
+    return solution;
+}
+
+template<typename T>
+Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
+{
+    // The right-hand side is copied; the copy is passed to trtrs and returned.
+    Array<T> solution = copyArray<T>(b);
+    const int order  = solution.dims()[0];
+    const int numRhs = solution.dims()[1];
+
+    // Translate the af_mat_prop bits into the character flags trtrs takes.
+    const char uplo = (options & AF_MAT_UPPER)     ? 'U' : 'L';
+    const char diag = (options & AF_MAT_DIAG_UNIT) ? 'U' : 'N';
+
+    std::shared_ptr<T> triMap = A.getMappedPtr();
+    std::shared_ptr<T> rhsMap = solution.getMappedPtr();
+
+    trtrs_func<T>()(AF_LAPACK_COL_MAJOR,
+                    uplo, 'N' /* transpose flag */, diag, order, numRhs,
+                    triMap.get(), A.strides()[1], rhsMap.get(), solution.strides()[1]);
+    return solution;
+}
+
+
+template<typename T>
+Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
+{
+    // Triangular systems are handled separately by triangleSolve().
+    if (options & AF_MAT_UPPER ||
+        options & AF_MAT_LOWER) {
+        return triangleSolve<T>(a, b, options);
+    }
+
+    int M = a.dims()[0];
+    int N = a.dims()[1];
+    int K = b.dims()[1];
+
+    // gesv/gels take mutable matrices, so both inputs are copied. B is
+    // allocated with max(M, N) rows (zero filled beyond b) and is returned.
+    Array<T> A = copyArray<T>(a);
+    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K), scalar<T>(0));
+
+    std::shared_ptr<T> aPtr = A.getMappedPtr();
+    std::shared_ptr<T> bPtr = B.getMappedPtr();
+
+    if(M == N) {
+        std::vector<int> pivot(N);
+        gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
+                       aPtr.get(), A.strides()[1],
+                       &pivot.front(),
+                       bPtr.get(), B.strides()[1]);
+    } else {
+        // The leading dimension of B must be B's own column stride: LAPACK
+        // sees the copies A and B, so a.strides() says nothing about them.
+        gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                       M, N, K,
+                       aPtr.get(), A.strides()[1],
+                       bPtr.get(), B.strides()[1]);
+        B.resetDims(dim4(N, K));   // the result is exposed as N x K
+    }
+    return B;
+}
+
+#define INSTANTIATE_SOLVE(T)                                            \
+    template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
+                               const af_mat_prop options);              \
+    template Array<T> solveLU<T>(const Array<T> &A, const Array<int> &pivot, \
+                                 const Array<T> &b, const af_mat_prop options); \
+
+INSTANTIATE_SOLVE(float)     // explicit instantiations of solve<T> and solveLU<T>, one per supported type
+INSTANTIATE_SOLVE(cfloat)    // (the blank line above terminates the macro definition)
+INSTANTIATE_SOLVE(double)
+INSTANTIATE_SOLVE(cdouble)
+
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_solve.hpp b/src/backend/opencl/cpu/cpu_solve.hpp
new file mode 100644
index 0000000000..6c3de642ad
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_solve.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // solve(): general or (per `options`) triangular solve. solveLU(): solve from an existing LU factor and pivots.
+    template<typename T>
+    Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options = AF_MAT_NONE);
+    template<typename T>
+    Array<T> solveLU(const Array<T> &a, const Array<int> &pivot,
+                     const Array<T> &b, const af_mat_prop options = AF_MAT_NONE);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_svd.cpp b/src/backend/opencl/cpu/cpu_svd.cpp
new file mode 100644
index 0000000000..3608bf69ce
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_svd.cpp
@@ -0,0 +1,112 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <cpu/cpu_helper.hpp>
+#include <cpu/cpu_svd.hpp>
+#include <copy.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+#define SVD_FUNC_DEF( FUNC )                                            \
+    template<typename T,typename Tr> svd_func_def<T, Tr> svd_func();
+
+#define SVD_FUNC( FUNC, T, Tr, PREFIX )                     \
+    template<> svd_func_def<T, Tr>     svd_func<T, Tr>()    \
+    { return & LAPACK_NAME(PREFIX##FUNC); }
+
+#if defined(USE_MKL) || defined(__APPLE__)
+    // MKL and Apple builds bind svd_func to gesdd, which takes a single job flag.
+    template<typename T, typename Tr>
+    using svd_func_def = int (*)(ORDER_TYPE,
+                                 char jobz,
+                                 int m, int n,
+                                 T* in, int ldin,
+                                 Tr* s,
+                                 T* u, int ldu,
+                                 T* vt, int ldvt);
+
+    SVD_FUNC_DEF( gesdd )   // FUNC is unused by SVD_FUNC_DEF: the accessor is always named svd_func
+    SVD_FUNC(gesdd, float  , float , s)
+    SVD_FUNC(gesdd, double , double, d)
+    SVD_FUNC(gesdd, cfloat , float , c)
+    SVD_FUNC(gesdd, cdouble, double, z)
+
+#else   // Atlas causes memory freeing issues with using gesdd
+    // Other builds bind svd_func to gesvd: separate jobu / jobvt flags plus a trailing `superb` array.
+    template<typename T, typename Tr>
+    using svd_func_def = int (*)(ORDER_TYPE,
+                                 char jobu, char jobvt,
+                                 int m, int n,
+                                 T* in, int ldin,
+                                 Tr* s,
+                                 T* u, int ldu,
+                                 T* vt, int ldvt,
+                                 Tr *superb);
+
+    SVD_FUNC_DEF( gesvd )   // FUNC is unused by SVD_FUNC_DEF: the accessor is always named svd_func
+    SVD_FUNC(gesvd, float  , float , s)
+    SVD_FUNC(gesvd, double , double, d)
+    SVD_FUNC(gesvd, cfloat , float , c)
+    SVD_FUNC(gesvd, cdouble, double, z)
+
+#endif // USE_MKL || __APPLE__
+
+    template <typename T, typename Tr>
+    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+    {
+        const int rows = in.dims()[0];
+        const int cols = in.dims()[1];
+
+        // Mapped in the order s, u, vt, in; all four are held across the LAPACK call.
+        std::shared_ptr<Tr> sMap  = s.getMappedPtr();
+        std::shared_ptr<T > uMap  = u.getMappedPtr();
+        std::shared_ptr<T > vtMap = vt.getMappedPtr();
+        std::shared_ptr<T > inMap = in.getMappedPtr();
+
+#if defined(USE_MKL) || defined(__APPLE__)
+        // gesdd variant: a single job flag.
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', rows, cols,
+                          inMap.get(), in.strides()[1],
+                          sMap.get(),
+                          uMap.get(), u.strides()[1],
+                          vtMap.get(), vt.strides()[1]);
+#else
+        // gesvd variant: two job flags and an extra array of min(rows, cols) entries.
+        std::vector<Tr> work(std::min(rows, cols));
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', rows, cols,
+                          inMap.get(), in.strides()[1],
+                          sMap.get(),
+                          uMap.get(), u.strides()[1],
+                          vtMap.get(), vt.strides()[1],
+                          &work[0]);
+#endif
+    }
+
+    template <typename T, typename Tr>
+    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+    {
+        Array<T> scratch = copyArray<T>(in);   // svdInPlace() takes a mutable input
+        svdInPlace<T, Tr>(s, u, vt, scratch);
+    }
+
+#define INSTANTIATE_SVD(T, Tr)                                          \
+    template void svd<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, const Array<T> &in); \
+    template void svdInPlace<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, Array<T> &in);
+    // Explicit instantiations: T is the matrix element type, Tr the type of the values written to s.
+    INSTANTIATE_SVD(float  , float )
+    INSTANTIATE_SVD(double , double)
+    INSTANTIATE_SVD(cfloat , float )
+    INSTANTIATE_SVD(cdouble, double)
+}
+}
+#endif
diff --git a/src/backend/opencl/cpu/cpu_svd.hpp b/src/backend/opencl/cpu/cpu_svd.hpp
new file mode 100644
index 0000000000..4f271af8b9
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_svd.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <Array.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    // Computes the SVD of a copy of `in`; s, u and vt are written to and must already be allocated by the caller.
+    template<typename T, typename Tr> void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in);
+
+    // Same, but hands `in` itself to LAPACK as its (mutable) input matrix.
+    template<typename T, typename Tr> void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in);
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_triangle.hpp b/src/backend/opencl/cpu/cpu_triangle.hpp
new file mode 100644
index 0000000000..e705420582
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_triangle.hpp
@@ -0,0 +1,57 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#ifndef CPU_LAPACK_TRIANGLE
+#define CPU_LAPACK_TRIANGLE
+
+#include <math.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+
+template<typename T, bool is_upper, bool is_unit_diag>
+void triangle(T *o, const T *i, const dim4 odm, const dim4 ost, const dim4 ist)
+{
+    // Visits every element of the output extents `odm`; `ost` / `ist` are
+    // the strides used to address the output and the input respectively.
+    for (dim_t w = 0; w < odm[3]; ++w) {
+        for (dim_t z = 0; z < odm[2]; ++z) {
+            // Offsets of the current 2-D slice in the output and the input.
+            const dim_t outSlice = w * ost[3] + z * ost[2];
+            const dim_t inSlice  = w * ist[3] + z * ist[2];
+
+            for (dim_t col = 0; col < odm[1]; ++col) {
+                const dim_t outCol = outSlice + col * ost[1];
+                const dim_t inCol  = inSlice  + col * ist[1];
+
+                for (dim_t row = 0; row < odm[0]; ++row) {
+                    T &dst = o[outCol + row];
+                    // Inside the kept triangle (diagonal included)?
+                    const bool keep = is_upper ? (col >= row) : (col <= row);
+                    if (!keep) {
+                        dst = scalar<T>(0);
+                    } else if (is_unit_diag && row == col) {
+                        dst = scalar<T>(1);
+                    } else {
+                        dst = i[inCol + row];
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
+
+#endif
+#endif
diff --git a/src/backend/opencl/debug_opencl.hpp b/src/backend/opencl/debug_opencl.hpp
index 74b3f7cf59..b4126f9abe 100644
--- a/src/backend/opencl/debug_opencl.hpp
+++ b/src/backend/opencl/debug_opencl.hpp
@@ -16,5 +16,10 @@
 #include <iostream>
 #define CL_DEBUG_FINISH(Q) Q.finish()
 #else
-#define CL_DEBUG_FINISH(Q)
+// Blocks on queue Q when synchronize_calls() is true. No trailing ';' here, so
+// `CL_DEBUG_FINISH(q);` expands to exactly one statement (safe inside if/else).
+#define CL_DEBUG_FINISH(Q)                      \
+    do {                                        \
+        if(synchronize_calls()) { (Q).finish(); } \
+    } while (false)
 #endif
diff --git a/src/backend/opencl/err_opencl.hpp b/src/backend/opencl/err_opencl.hpp
index 15855f3b08..955275203a 100644
--- a/src/backend/opencl/err_opencl.hpp
+++ b/src/backend/opencl/err_opencl.hpp
@@ -23,8 +23,8 @@
         char opencl_err_msg[1024];                              \
         snprintf(opencl_err_msg,                                \
                  sizeof(opencl_err_msg),                        \
-                 "OpenCL Error: %s when calling %s",            \
-                 getErrorMessage(ERR.err()).c_str(),            \
+                 "OpenCL Error (%d): %s when calling %s",       \
+                 ERR.err(), getErrorMessage(ERR.err()).c_str(), \
                  ERR.what());                                   \
         if (ERR.err() == CL_MEM_OBJECT_ALLOCATION_FAILURE) {    \
             AF_ERROR(opencl_err_msg, AF_ERR_NO_MEM);            \
diff --git a/src/backend/opencl/inverse.cpp b/src/backend/opencl/inverse.cpp
index eb8348edd4..df955547ba 100644
--- a/src/backend/opencl/inverse.cpp
+++ b/src/backend/opencl/inverse.cpp
@@ -12,6 +12,8 @@
 #include <identity.hpp>
 
 #if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <platform.hpp>
+#include <cpu/cpu_inverse.hpp>
 
 namespace opencl
 {
@@ -19,6 +21,10 @@ namespace opencl
 template<typename T>
 Array<T> inverse(const Array<T> &in)
 {
+    if(OpenCLCPUOffload()) {                     // host LAPACK path, see cpu/cpu_inverse.cpp
+        if (in.dims()[0] == in.dims()[1])        // only square inputs are offloaded; others fall through to solve()
+            return cpu::inverse(in);
+    }
     Array<T> I = identity<T>(in.dims());
     return solve<T>(in, I);
 }
diff --git a/src/backend/opencl/jit.cpp b/src/backend/opencl/jit.cpp
index 66c7c1e9f7..d6ab240fd6 100644
--- a/src/backend/opencl/jit.cpp
+++ b/src/backend/opencl/jit.cpp
@@ -19,6 +19,7 @@
 #include <dispatch.hpp>
 #include <err_opencl.hpp>
 #include <functional>
+#include <af/opencl.h>
 
 namespace opencl
 {
@@ -180,13 +181,16 @@ void evalNodes(Param &out, Node *node)
         uint groups_1 = 1;
         uint num_odims = 4;
 
+        // CPUs seem to perform better with work group size 1024
+        const int work_group_size = (getActiveDeviceType() == AFCL_DEVICE_TYPE_CPU) ? 1024 : 256;
+
         while (num_odims >= 1) {
             if (out.info.dims[num_odims - 1] == 1) num_odims--;
             else break;
         }
 
         if (is_linear) {
-            local_0 = 256;
+            local_0 = work_group_size;
             uint out_elements = out.info.dims[3] * out.info.strides[3];
             uint groups = divup(out_elements, local_0);
 
@@ -194,8 +198,8 @@ void evalNodes(Param &out, Node *node)
             global_0 = divup(groups, global_1) * local_0;
 
         } else {
-            local_0 = 64;
             local_1 =  4;
+            local_0 = work_group_size / local_1;
 
             groups_0 = divup(out.info.dims[0], local_0);
             groups_1 = divup(out.info.dims[1], local_1);
diff --git a/src/backend/opencl/kernel/convolve.hpp b/src/backend/opencl/kernel/convolve.hpp
index 035f4c23aa..6d1d7de7ee 100644
--- a/src/backend/opencl/kernel/convolve.hpp
+++ b/src/backend/opencl/kernel/convolve.hpp
@@ -52,6 +52,7 @@ void convolve_nd(Param out, const Param signal, const Param filter, ConvolveBatc
         case 3: conv3<T, accType, expand>(param, out, signal, filter); break;
     }
 
+    CL_DEBUG_FINISH(getQueue());
     bufferFree(param.impulse);
 }
 
diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp
index 0adc0c8e47..17fc460970 100644
--- a/src/backend/opencl/kernel/ireduce.hpp
+++ b/src/backend/opencl/kernel/ireduce.hpp
@@ -281,6 +281,14 @@ namespace kernel
         }
     }
 
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wunused-function"
+#else
+    /* Other */
+#endif
+
     template<typename T> double cabs(const T in) { return (double)in; }
     static double cabs(const cfloat in) { return (double)abs(in); }
     static double cabs(const cdouble in) { return (double)abs(in); }
@@ -327,6 +335,12 @@ namespace kernel
         }
     };
 
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic pop
+#else
+    /* Other */
+#endif
 
     template<typename T, af_op_t op>
     T ireduce_all(uint *loc, Param in)
diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl
index b34bbcddd8..3092449418 100644
--- a/src/backend/opencl/kernel/jit.cl
+++ b/src/backend/opencl/kernel/jit.cl
@@ -8,6 +8,7 @@
  ********************************************************/
 
 #define sign(in) signbit((in))
+#define __noop(a) (a)
 #define __add(lhs, rhs) (lhs) + (rhs)
 #define __sub(lhs, rhs) (lhs) - (rhs)
 #define __mul(lhs, rhs) (lhs) * (rhs)
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index 871370d63b..69c1176210 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -29,8 +29,24 @@ using cl::LocalSpaceArg;
 using cl::NDRange;
 using std::vector;
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#if defined(__clang__)
+    /* Clang/LLVM */
+    #pragma clang diagnostic push
+    #pragma clang diagnostic ignored "-Wsometimes-uninitialized"
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    /* Intel ICC/ICPC */
+    // Fix the warning code here, if any
+#elif defined(__GNUC__) || defined(__GNUG__)
+    /* GNU GCC/G++ */
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#elif defined(_MSC_VER)
+    /* Microsoft Visual Studio */
+    #pragma warning( push )
+    #pragma warning( disable : 4700 )
+#else
+    /* Other */
+#endif
 
 namespace opencl
 {
@@ -505,4 +521,19 @@ void orb(unsigned* out_feat,
 } //namespace kernel
 
 } //namespace opencl
-#pragma GCC diagnostic pop
+
+#if defined(__clang__)
+    /* Clang/LLVM */
+    #pragma clang diagnostic pop
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    /* Intel ICC/ICPC */
+    // Fix the warning code here, if any
+#elif defined(__GNUC__) || defined(__GNUG__)
+    /* GNU GCC/G++ */
+    #pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+    /* Microsoft Visual Studio */
+    #pragma warning( pop )
+#else
+    /* Other */
+#endif
diff --git a/src/backend/opencl/kernel/select.cl b/src/backend/opencl/kernel/select.cl
index 94a36031c3..03248be1b9 100644
--- a/src/backend/opencl/kernel/select.cl
+++ b/src/backend/opencl/kernel/select.cl
@@ -41,7 +41,7 @@ void select_kernel(__global T *optr, KParam oinfo,
     const int idw = get_group_id(1) / groups_1;
 
     const int group_id_0 = get_group_id(0) - idz * groups_0;
-    const int group_id_1 = get_group_id(1) - idz * groups_1;
+    const int group_id_1 = get_group_id(1) - idw * groups_1;
 
     const int idx = group_id_0 * get_local_size(0) + get_local_id(0);
     const int idy = group_id_1 * get_local_size(1) + get_local_id(1);
@@ -80,7 +80,7 @@ void select_scalar_kernel(__global T *optr, KParam oinfo,
     const int idw = get_group_id(1) / groups_1;
 
     const int group_id_0 = get_group_id(0) - idz * groups_0;
-    const int group_id_1 = get_group_id(1) - idz * groups_1;
+    const int group_id_1 = get_group_id(1) - idw * groups_1;
 
     const int idx = group_id_0 * get_local_size(0) + get_local_id(0);
     const int idy = group_id_1 * get_local_size(1) + get_local_id(1);
diff --git a/src/backend/opencl/kernel/transform.cl b/src/backend/opencl/kernel/transform.cl
index 824f50cc5d..c44c18457a 100644
--- a/src/backend/opencl/kernel/transform.cl
+++ b/src/backend/opencl/kernel/transform.cl
@@ -11,8 +11,27 @@
 #define BILINEAR transform_b
 #define LOWER transform_l
 
-void calc_affine_inverse(float* txo, __global const float* txi)
+void calc_transf_inverse(float* txo, __global const float* txi)
 {
+#if PERSPECTIVE
+    txo[0] =   txi[4]*txi[8] - txi[5]*txi[7];
+    txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]);
+    txo[2] =   txi[1]*txi[5] - txi[2]*txi[4];
+
+    txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]);
+    txo[4] =   txi[0]*txi[8] - txi[2]*txi[6];
+    txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]);
+
+    txo[6] =   txi[3]*txi[7] - txi[4]*txi[6];
+    txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]);
+    txo[8] =   txi[0]*txi[4] - txi[1]*txi[3];
+
+    float det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6];
+
+    txo[0] /= det; txo[1] /= det; txo[2] /= det;
+    txo[3] /= det; txo[4] /= det; txo[5] /= det;
+    txo[6] /= det; txo[7] /= det; txo[8] /= det;
+#else
     float det = txi[0]*txi[4] - txi[1]*txi[3];
 
     txo[0] = txi[4] / det;
@@ -22,6 +41,7 @@ void calc_affine_inverse(float* txo, __global const float* txi)
 
     txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
     txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
+#endif
 }
 
 __kernel
@@ -59,17 +79,17 @@ void transform_kernel(__global T *d_out, const KParam out,
 
     // Transform is in global memory.
     // Needs offset to correct transform being processed.
-    __global const float *tmat_ptr = c_tmat + t_idx * 6;
-    float tmat[6];
+    __global const float *tmat_ptr = c_tmat + t_idx * TRANSF_LEN;
+    float tmat[TRANSF_LEN];
 
     // We expect a inverse transform matrix by default
     // If it is an forward transform, then we need its inverse
     if(INVERSE == 1) {
-        #pragma unroll
-        for(int i = 0; i < 6; i++)
+        #pragma unroll 3
+        for(int i = 0; i < TRANSF_LEN; i++)
             tmat[i] = tmat_ptr[i];
     } else {
-        calc_affine_inverse(tmat, tmat_ptr);
+        calc_transf_inverse(tmat, tmat_ptr);
     }
 
     if (xido >= out.dims[0] && yido >= out.dims[1]) return;
diff --git a/src/backend/opencl/kernel/transform.hpp b/src/backend/opencl/kernel/transform.hpp
index 677acc31fe..f78c7b0ebe 100644
--- a/src/backend/opencl/kernel/transform.hpp
+++ b/src/backend/opencl/kernel/transform.hpp
@@ -50,7 +50,7 @@ namespace opencl
                                             >::type;
 
 
-        template<typename T, bool isInverse, af_interp_type method>
+        template<typename T, bool isInverse, bool isPerspective, af_interp_type method>
         void transform(Param out, const Param in, const Param tf)
         {
             try {
@@ -64,11 +64,13 @@ namespace opencl
                 std::call_once( compileFlags[device], [device] () {
                     ToNum<T> toNum;
                     std::ostringstream options;
-                    options << " -D T="        << dtype_traits<T>::getName()
-                            << " -D INVERSE="  << (isInverse ? 1 : 0)
-                            << " -D ZERO="     << toNum(scalar<T>(0));
-                    options << " -D VT="       << dtype_traits<vtype_t<T>>::getName();
-                    options << " -D WT="       << dtype_traits<wtype_t<BT>>::getName();
+                    options << " -D T="           << dtype_traits<T>::getName()
+                            << " -D INVERSE="     << (isInverse ? 1 : 0)
+                            << " -D PERSPECTIVE=" << (isPerspective ? 1 : 0)
+                            << " -D TRANSF_LEN="  << (isPerspective ? 9 : 6)
+                            << " -D ZERO="        << toNum(scalar<T>(0));
+                    options << " -D VT="          << dtype_traits<vtype_t<T>>::getName();
+                    options << " -D WT="          << dtype_traits<wtype_t<BT>>::getName();
 
                     if((af_dtype) dtype_traits<T>::af_type == c32 ||
                        (af_dtype) dtype_traits<T>::af_type == c64) {
diff --git a/src/backend/opencl/kernel/transform_interp.cl b/src/backend/opencl/kernel/transform_interp.cl
index 1d82951b9d..a083df0ff6 100644
--- a/src/backend/opencl/kernel/transform_interp.cl
+++ b/src/backend/opencl/kernel/transform_interp.cl
@@ -25,12 +25,23 @@ void transform_n(__global T *d_out, const KParam out, __global const T *d_in, co
                  const float *tmat, const int xido, const int yido, const int nimages)
 {
     // Compute input index
-    const int xidi = round(xido * tmat[0]
-                         + yido * tmat[1]
-                                + tmat[2]);
-    const int yidi = round(xido * tmat[3]
-                         + yido * tmat[4]
-                                + tmat[5]);
+    int xidi = 0, yidi = 0;
+#if PERSPECTIVE
+    const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+    xidi = round((xido * tmat[0]
+                + yido * tmat[1]
+                       + tmat[2]) / W);
+    yidi = round((xido * tmat[3]
+                + yido * tmat[4]
+                       + tmat[5]) / W);
+#else
+    xidi = round(xido * tmat[0]
+               + yido * tmat[1]
+                      + tmat[2]);
+    yidi = round(xido * tmat[3]
+               + yido * tmat[4]
+                      + tmat[5]);
+#endif
 
     // Compute memory location of indices
     const int loci = yidi * in.strides[1]  + xidi;
@@ -54,12 +65,23 @@ void transform_b(__global T *d_out, const KParam out, __global const T *d_in, co
     const int loco = (yido * out.strides[1] + xido);
 
     // Compute input index
-    const float xid = xido * tmat[0]
-                    + yido * tmat[1]
-                           + tmat[2];
-    const float yid = xido * tmat[3]
-                    + yido * tmat[4]
-                           + tmat[5];
+    float xid = 0.0f, yid = 0.0f;
+#if PERSPECTIVE
+    const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+    xid = (xido * tmat[0]
+         + yido * tmat[1]
+                + tmat[2]) / W;
+    yid = (xido * tmat[3]
+         + yido * tmat[4]
+                + tmat[5]) / W;
+#else
+    xid = xido * tmat[0]
+        + yido * tmat[1]
+               + tmat[2];
+    yid = xido * tmat[3]
+        + yido * tmat[4]
+               + tmat[5];
+#endif
 
     T zero = ZERO;
     if (xid < -0.001 || yid < -0.001 || in.dims[0] < xid || in.dims[1] < yid) {
@@ -104,12 +126,23 @@ void transform_l(__global T *d_out, const KParam out, __global const T *d_in, co
                  const float *tmat, const int xido, const int yido, const int nimages)
 {
     // Compute input index
-    const int xidi = floor(xido * tmat[0]
-                         + yido * tmat[1]
-                                + tmat[2]);
-    const int yidi = floor(xido * tmat[3]
-                         + yido * tmat[4]
-                                + tmat[5]);
+    int xidi = 0, yidi = 0;
+#if PERSPECTIVE
+    const float W = xido * tmat[6] + yido * tmat[7] + tmat[8];
+    xidi = floor((xido * tmat[0]
+                + yido * tmat[1]
+                       + tmat[2]) / W);
+    yidi = floor((xido * tmat[3]
+                + yido * tmat[4]
+                       + tmat[5]) / W);
+#else
+    xidi = floor(xido * tmat[0]
+               + yido * tmat[1]
+                      + tmat[2]);
+    yidi = floor(xido * tmat[3]
+               + yido * tmat[4]
+                      + tmat[5]);
+#endif
 
     // Compute memory location of indices
     const int loci = yidi * in.strides[1]  + xidi;
diff --git a/src/backend/opencl/kernel/where.hpp b/src/backend/opencl/kernel/where.hpp
index 2cbf8c1019..2b1308fcec 100644
--- a/src/backend/opencl/kernel/where.hpp
+++ b/src/backend/opencl/kernel/where.hpp
@@ -159,7 +159,9 @@ namespace kernel
                 out.info.strides[k] = total;
             }
 
-            get_out_idx<T>(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y);
+            if (total > 0) {
+                get_out_idx<T>(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y);
+            }
 
             bufferFree(rtmp.data);
             bufferFree(otmp.data);
diff --git a/src/backend/opencl/lu.cpp b/src/backend/opencl/lu.cpp
index ee76f47201..0bc6bd5283 100644
--- a/src/backend/opencl/lu.cpp
+++ b/src/backend/opencl/lu.cpp
@@ -14,7 +14,9 @@
 #include <kernel/lu_split.hpp>
 #include <copy.hpp>
 #include <blas.hpp>
+#include <platform.hpp>
 #include <magma/magma.h>
+#include <cpu/cpu_lu.hpp>
 
 namespace opencl
 {
@@ -41,8 +43,11 @@ Array<int> convertPivot(int *ipiv, int in_sz, int out_sz)
 template<typename T>
 void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
 {
-
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::lu(lower, upper, pivot, in);
+        }
+
         dim4 iDims = in.dims();
         int M = iDims[0];
         int N = iDims[1];
@@ -67,6 +72,10 @@ template<typename T>
 Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::lu_inplace(in, convert_pivot);
+        }
+
         initBlas();
         dim4 iDims = in.dims();
         int M = iDims[0];
@@ -88,6 +97,11 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
     }
 }
 
+bool isLAPACKAvailable()
+{
+    return true;
+}
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -116,6 +130,11 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
     AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED);
 }
 
+bool isLAPACKAvailable()
+{
+    return false;
+}
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
diff --git a/src/backend/opencl/lu.hpp b/src/backend/opencl/lu.hpp
index af43f24614..b44eca8c60 100644
--- a/src/backend/opencl/lu.hpp
+++ b/src/backend/opencl/lu.hpp
@@ -17,4 +17,6 @@ namespace opencl
 
     template<typename T>
     Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+
+    bool isLAPACKAvailable();
 }
diff --git a/src/backend/opencl/magma/getrs.cpp b/src/backend/opencl/magma/getrs.cpp
index 1dc106c0c5..eb28a5175a 100644
--- a/src/backend/opencl/magma/getrs.cpp
+++ b/src/backend/opencl/magma/getrs.cpp
@@ -61,6 +61,7 @@
 #include <platform.hpp>
 #include <algorithm>
 #include <string>
+#include <af/opencl.h>
 
 template<typename Ty>  magma_int_t
 magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
@@ -168,8 +169,7 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
     clblasTranspose cltrans =(trans == MagmaNoTrans) ? clblasNoTrans :
         (trans == MagmaTrans ? clblasTrans : clblasConjTrans);
 
-    std::string pName = opencl::getPlatformName(opencl::getDevice());
-    bool cond = pName.find("NVIDIA") != std::string::npos;
+    bool cond = opencl::getActivePlatform() == AFCL_PLATFORM_NVIDIA;
     cl_mem dAT = 0;
     if (nrhs > 1 && cond) {
         magma_malloc<Ty>(&dAT, n * n);
diff --git a/src/backend/opencl/magma/magma_cpu_blas.h b/src/backend/opencl/magma/magma_cpu_blas.h
index b3cba096b5..6661aad657 100644
--- a/src/backend/opencl/magma/magma_cpu_blas.h
+++ b/src/backend/opencl/magma/magma_cpu_blas.h
@@ -13,16 +13,16 @@
 #include <defines.hpp>
 #include "magma_types.h"
 
-#ifdef __APPLE__
-#include <Accelerate/Accelerate.h>
-#else
 #ifdef USE_MKL
-#include <mkl_cblas.h>
+    #include <mkl_cblas.h>
 #else
-extern "C" {
-#include <cblas.h>
-}
-#endif
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+    #else
+        extern "C" {
+            #include <cblas.h>
+        }
+    #endif
 #endif
 
 // Todo: Ask upstream for a more official way to detect it
diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h
index 5974dab8a9..54c26ae0e9 100644
--- a/src/backend/opencl/magma/magma_cpu_lapack.h
+++ b/src/backend/opencl/magma/magma_cpu_lapack.h
@@ -39,16 +39,20 @@ int LAPACKE_dlacgv_work(Args... args) { return 0; }
 #define ORDER_TYPE int
 #define LAPACK_NAME(fn) LAPACKE_##fn
 
-#if defined(__APPLE__)
-    #define LAPACK_COL_MAJOR 102
-    #include "../../lapacke.hpp"
+#ifdef USE_MKL
+    #include<mkl_lapacke.h>
 #else
-    #ifdef USE_MKL
-        #include<mkl_lapacke.h>
+    #ifdef __APPLE__
+        #include <Accelerate/Accelerate.h>
+        #include <lapacke.hpp>
+        #undef LAPACK_COL_MAJOR
+        #define LAPACK_COL_MAJOR 102
+        #undef AF_LAPACK_COL_MAJOR
+        #define AF_LAPACK_COL_MAJOR 0
     #else // NETLIB LAPACKE
         #include<lapacke.h>
-    #endif  // MKL/NETLIB
-#endif  //APPLE
+    #endif
+#endif
 
 #define LAPACKE_CHECK(fn) do {                  \
         int __info = fn;                        \
diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp
index 584a412191..481f08c346 100644
--- a/src/backend/opencl/magma/magma_helper.cpp
+++ b/src/backend/opencl/magma/magma_helper.cpp
@@ -159,6 +159,14 @@ magma_int_t magma_get_geqrf_nb<magmaDoubleComplex>( magma_int_t m )
     else                return 128;
 }
 
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wmissing-braces"
+#else
+    /* Other */
+#endif
+
 template<typename T> T magma_make(double r, double i) { return (T) r; }
 template float magma_make<float>(double r, double i);
 template double magma_make<double>(double r, double i);
@@ -172,3 +180,10 @@ template<> magmaDoubleComplex magma_make<magmaDoubleComplex>(double r, double i)
     magmaDoubleComplex tmp = {r, i};
     return tmp;
 }
+
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic pop
+#else
+    /* Other */
+#endif
diff --git a/src/backend/opencl/magma/potrf.cpp b/src/backend/opencl/magma/potrf.cpp
index d048ed4dac..4f9984f325 100644
--- a/src/backend/opencl/magma/potrf.cpp
+++ b/src/backend/opencl/magma/potrf.cpp
@@ -199,7 +199,7 @@ magma_int_t magma_potrf_gpu(
                 magma_getmatrix_async<Ty>(jb, jb, dA(j,j), ldda, work, jb, queue, &event);
 
                 // apply all previous updates to block row right of diagonal block
-                if (j+jb < n) {
+                if (j+jb < n && j > 0) {
                     CLBLAS_CHECK(gpu_blas_gemm(
                                      transType, clblasNoTrans,
                                      jb, n-j-jb, j,
@@ -259,7 +259,7 @@ magma_int_t magma_potrf_gpu(
                 magma_getmatrix_async<Ty>(jb, jb, dA(j,j), ldda, work, jb, queue, &event);
 
                 // apply all previous updates to block column below diagonal block
-                if (j+jb < n) {
+                if (j+jb < n && j > 0) {
                     CLBLAS_CHECK(gpu_blas_gemm(
                                      clblasNoTrans, transType,
                                      n-j-jb, jb, j,
diff --git a/src/backend/opencl/math.hpp b/src/backend/opencl/math.hpp
index 9292d398a0..f090062b03 100644
--- a/src/backend/opencl/math.hpp
+++ b/src/backend/opencl/math.hpp
@@ -17,6 +17,14 @@
 #include "backend.hpp"
 #include "types.hpp"
 
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wunused-function"
+#else
+    /* Other */
+#endif
+
 namespace opencl
 {
 
@@ -123,3 +131,10 @@ namespace opencl
     cfloat operator *(cfloat a, cfloat b);
     cdouble operator *(cdouble a, cdouble b);
 }
+
+#if defined(__GNUC__) || defined(__GNUG__)
+    /* GCC/G++, Clang/LLVM, Intel ICC */
+    #pragma GCC diagnostic pop
+#else
+    /* Other */
+#endif
diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp
index f4c740482e..5df64d6d86 100644
--- a/src/backend/opencl/memory.cpp
+++ b/src/backend/opencl/memory.cpp
@@ -10,344 +10,265 @@
 #include <memory.hpp>
 #include <dispatch.hpp>
 #include <map>
+#include <iostream>
+#include <iomanip>
+#include <string>
 #include <types.hpp>
+#include <err_opencl.hpp>
 
-namespace opencl
-{
-    static size_t memory_resolution = 1024; //1KB
-
-    void setMemStepSize(size_t step_bytes)
-    {
-        memory_resolution = step_bytes;
-    }
+#include <MemoryManager.hpp>
 
-    size_t getMemStepSize(void)
-    {
-        return memory_resolution;
-    }
-
-    // Manager Class
-    // Dummy used to call garbage collection at the end of the program
-    class Manager
-    {
-        public:
-        static bool initialized;
-        Manager()
-        {
-            initialized = true;
-        }
+#ifndef AF_MEM_DEBUG
+#define AF_MEM_DEBUG 0
+#endif
 
-        ~Manager()
-        {
-            for(int i = 0; i < (int)getDeviceCount(); i++) {
-                setDevice(i);
-                garbageCollect();
-                pinnedGarbageCollect();
-            }
-        }
-    };
+#ifndef AF_OPENCL_MEM_DEBUG
+#define AF_OPENCL_MEM_DEBUG 0
+#endif
 
-    bool Manager::initialized = false;
+namespace opencl
+{
 
-    static void managerInit()
+class MemoryManager  : public common::MemoryManager
+{
+    int getActiveDeviceId();
+    size_t getMaxMemorySize(int id);
+public:
+    MemoryManager();
+    void *nativeAlloc(const size_t bytes);
+    void nativeFree(void *ptr);
+    ~MemoryManager()
     {
-        if(Manager::initialized == false)
-            static Manager pm = Manager();
+        common::lock_guard_t lock(this->memory_mutex);
+        for (int n = 0; n < getDeviceCount(); n++) {
+            opencl::setDevice(n);
+            this->garbageCollect();
+        }
     }
+};
 
-    typedef struct
-    {
-        bool is_free;
-        bool is_unlinked;
-        size_t bytes;
-    } mem_info;
+class MemoryManagerPinned  : public common::MemoryManager
+{
+    std::vector<
+        std::map<void *, cl::Buffer>
+        > pinned_maps;
+    int getActiveDeviceId();
+    size_t getMaxMemorySize(int id);
 
-    static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0};
-    static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0};
-    static size_t total_bytes[DeviceManager::MAX_DEVICES] = {0};
+public:
 
-    typedef std::map<cl::Buffer *, mem_info> mem_t;
-    typedef mem_t::iterator mem_iter;
-    mem_t memory_maps[DeviceManager::MAX_DEVICES];
+    MemoryManagerPinned();
 
-    static void destroy(cl::Buffer *ptr)
-    {
-        delete ptr;
-    }
+    void *nativeAlloc(const size_t bytes);
+    void nativeFree(void *ptr);
 
-    void garbageCollect()
+    ~MemoryManagerPinned()
     {
-        int n = getActiveDeviceId();
-        for(mem_iter iter = memory_maps[n].begin();
-            iter != memory_maps[n].end(); ++iter) {
-
-            if ((iter->second).is_free) {
-
-                if (!(iter->second).is_unlinked) {
-                    destroy(iter->first);
-                    total_bytes[n] -= iter->second.bytes;
-                }
-            }
-        }
-
-        mem_iter memory_curr = memory_maps[n].begin();
-        mem_iter memory_end  = memory_maps[n].end();
-
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.is_free  && !memory_curr->second.is_unlinked) {
-                memory_curr = memory_maps[n].erase(memory_curr);
-            } else {
-                ++memory_curr;
+        common::lock_guard_t lock(this->memory_mutex);
+        for (int n = 0; n < getDeviceCount(); n++) {
+            opencl::setDevice(n);
+            this->garbageCollect();
+            auto pinned_curr_iter = pinned_maps[n].begin();
+            auto pinned_end_iter  = pinned_maps[n].end();
+            while (pinned_curr_iter != pinned_end_iter) {
+                pinned_maps[n].erase(pinned_curr_iter++);
             }
         }
     }
+};
 
-    cl::Buffer *bufferAlloc(const size_t &bytes)
-    {
-        int n = getActiveDeviceId();
-        cl::Buffer *ptr = NULL;
-        size_t alloc_bytes = divup(bytes, memory_resolution) * memory_resolution;
-
-        if (bytes > 0) {
-
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) {
-                garbageCollect();
-            }
-
-            for(mem_iter iter = memory_maps[n].begin();
-                iter != memory_maps[n].end(); ++iter) {
-
-                mem_info info = iter->second;
-
-                if ( info.is_free &&
-                    !info.is_unlinked &&
-                     info.bytes == alloc_bytes) {
+int MemoryManager::getActiveDeviceId()
+{
+    return opencl::getActiveDeviceId();
+}
 
-                    iter->second.is_free = false;
-                    used_bytes[n] += alloc_bytes;
-                    used_buffers[n]++;
-                    return iter->first;
-                }
-            }
+size_t MemoryManager::getMaxMemorySize(int id)
+{
+    return opencl::getDeviceMemorySize(id);
+}
 
-            try {
-                ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes);
-            } catch(...) {
-                garbageCollect();
-                ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes);
-            }
+MemoryManager::MemoryManager() :
+    common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG)
+{
+    this->setMaxMemorySize();
+}
 
-            mem_info info = {false, false, alloc_bytes};
-            memory_maps[n][ptr] = info;
-            used_bytes[n] += alloc_bytes;
-            used_buffers[n]++;
-            total_bytes[n] += alloc_bytes;
-        }
-        return ptr;
+void *MemoryManager::nativeAlloc(const size_t bytes)
+{
+    try { // hand back a heap-allocated cl::Buffer of `bytes` bytes as an opaque pointer
+        return (void *)(new cl::Buffer(getContext(), CL_MEM_READ_WRITE, bytes));
+    } catch(const cl::Error &err) { // by const reference: no copy, no slicing
+        CL_TO_AF_ERROR(err);        // reported through the AF error macro
     }
+}
 
-    void bufferFree(cl::Buffer *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find(ptr);
-
-        if (iter != memory_maps[n].end()) {
-
-            iter->second.is_free = true;
-            if ((iter->second).is_unlinked) return;
-
-            used_bytes[n] -= iter->second.bytes;
-            used_buffers[n]--;
-        } else {
-            destroy(ptr); // Free it because we are not sure what the size is
-        }
+void MemoryManager::nativeFree(void *ptr)
+{
+    try { // ptr is the cl::Buffer* that nativeAlloc returned as void*
+        delete (cl::Buffer *)ptr;
+    } catch(const cl::Error &err) { // by const reference: no copy, no slicing
+        CL_TO_AF_ERROR(err);        // reported through the AF error macro
     }
+}
 
-    void bufferPop(cl::Buffer *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find(ptr);
-
-        if (iter != memory_maps[n].end()) {
-            iter->second.is_unlinked = true;
-        } else {
-
-            mem_info info = { false,
-                              false,
-                              100 }; //This number is not relevant
+static MemoryManager &getMemoryManager()
+{
+    static MemoryManager instance;
+    return instance;
+}
 
-            memory_maps[n][ptr] = info;
-        }
-    }
+int MemoryManagerPinned::getActiveDeviceId()
+{
+    return opencl::getActiveDeviceId();
+}
 
-    void bufferPush(cl::Buffer *ptr)
-    {
-        int n = getActiveDeviceId();
-        mem_iter iter = memory_maps[n].find(ptr);
+size_t MemoryManagerPinned::getMaxMemorySize(int id)
+{
+    return opencl::getDeviceMemorySize(id);
+}
 
-        if (iter != memory_maps[n].end()) {
-            iter->second.is_unlinked = false;
-        }
-    }
+MemoryManagerPinned::MemoryManagerPinned() :
+    common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG),
+    pinned_maps(getDeviceCount())
+{
+    this->setMaxMemorySize();
+}
 
-    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-    {
-        int n = getActiveDeviceId();
-        if (alloc_bytes   ) *alloc_bytes   = total_bytes[n];
-        if (alloc_buffers ) *alloc_buffers = memory_maps[n].size();
-        if (lock_bytes    ) *lock_bytes    = used_bytes[n];
-        if (lock_buffers  ) *lock_buffers  = used_buffers[n];
+void *MemoryManagerPinned::nativeAlloc(const size_t bytes)
+{
+    void *ptr = NULL;
+    try { // create a host-allocated buffer and map it (blocking) for read/write
+        cl::Buffer buf = cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, bytes);
+        ptr = getQueue().enqueueMapBuffer(buf, true, CL_MAP_READ | CL_MAP_WRITE, 0, bytes);
+        pinned_maps[opencl::getActiveDeviceId()][ptr] = buf; // remembered so nativeFree can unmap it
+    } catch(const cl::Error &err) { // by const reference: no copy, no slicing
+        CL_TO_AF_ERROR(err);
     }
+    return ptr;
+}
 
-    template<typename T>
-    T *memAlloc(const size_t &elements)
-    {
-        managerInit();
-        return (T *)bufferAlloc(elements * sizeof(T));
-    }
+void MemoryManagerPinned::nativeFree(void *ptr)
+{
+    try { // unmap ptr and drop its buffer record; unknown pointers are ignored
+        int n = opencl::getActiveDeviceId();
+        auto iter = pinned_maps[n].find(ptr);
 
-    template<typename T>
-    void memFree(T *ptr)
-    {
-        return bufferFree((cl::Buffer *)ptr);
-    }
+        if (iter != pinned_maps[n].end()) {
+            getQueue().enqueueUnmapMemObject(iter->second, ptr); // reuse iter: no second lookup
+            pinned_maps[n].erase(iter);
+        }
 
-    template<typename T>
-    void memPop(const T *ptr)
-    {
-        return bufferPop((cl::Buffer *)ptr);
+    } catch(const cl::Error &err) { // by const reference: no copy, no slicing
+        CL_TO_AF_ERROR(err);
     }
+}
 
-    template<typename T>
-    void memPush(const T *ptr)
-    {
-        return bufferPush((cl::Buffer *)ptr);
-    }
+static MemoryManagerPinned &getMemoryManagerPinned()
+{
+    static MemoryManagerPinned instance;
+    return instance;
+}
 
-    // pinned memory manager
-    typedef struct {
-        cl::Buffer *buf;
-        mem_info info;
-    } pinned_info;
+void setMemStepSize(size_t step_bytes) {
+    // Hands the requested step size to the device memory manager unchanged.
+    getMemoryManager().setMemStepSize(step_bytes);
+}
 
-    typedef std::map<void*, pinned_info> pinned_t;
-    typedef pinned_t::iterator pinned_iter;
-    pinned_t pinned_maps[DeviceManager::MAX_DEVICES];
-    static size_t pinned_used_bytes = 0;
+size_t getMemStepSize(void) {
+    const size_t current_step = getMemoryManager().getMemStepSize();  // value held by the device memory manager
+    return current_step;
+}
 
-    static void pinnedDestroy(cl::Buffer *buf, void *ptr)
-    {
-        getQueue().enqueueUnmapMemObject(*buf, (void *)ptr);
-        destroy(buf);
-    }
+size_t getMaxBytes() {
+    const size_t max_bytes = getMemoryManager().getMaxBytes();  // forwarded from the device memory manager
+    return max_bytes;
+}
 
-    void pinnedGarbageCollect()
-    {
-        int n = getActiveDeviceId();
-        for(auto &iter : pinned_maps[n]) {
-            if ((iter.second).info.is_free) {
-                pinnedDestroy(iter.second.buf, iter.first);
-            }
-        }
+unsigned getMaxBuffers() {
+    const unsigned max_buffers = getMemoryManager().getMaxBuffers();  // forwarded from the device memory manager
+    return max_buffers;
+}
 
-        pinned_iter memory_curr = pinned_maps[n].begin();
-        pinned_iter memory_end  = pinned_maps[n].end();
+void garbageCollect() {
+    // Delegates to the device memory manager's garbageCollect().
+    getMemoryManager().garbageCollect();
+}
 
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.info.is_free) {
-                memory_curr = pinned_maps[n].erase(memory_curr);
-            } else {
-                ++memory_curr;
-            }
-        }
+void printMemInfo(const char *msg, const int device) {
+    // Both arguments are passed through untouched to the manager's printInfo().
+    getMemoryManager().printInfo(msg, device);
+}
 
-    }
+template<typename T>
+T* memAlloc(const size_t &elements) {
+    const size_t nbytes = elements * sizeof(T);  // element count -> byte count
+    return (T *)getMemoryManager().alloc(nbytes, false);  // second argument is false here; memAllocUser passes true
+}
 
-    void *pinnedBufferAlloc(const size_t &bytes)
-    {
-        void *ptr = NULL;
-        int n = getActiveDeviceId();
-        // Allocate the higher megabyte. Overhead of creating pinned memory is
-        // more so we want more resuable memory.
-        size_t alloc_bytes = divup(bytes, 1048576) * 1048576;
-
-        if (bytes > 0) {
-            cl::Buffer *buf = NULL;
-
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (pinned_maps[n].size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
-                pinnedGarbageCollect();
-            }
+void* memAllocUser(const size_t &bytes) {
+    void *user_ptr = getMemoryManager().alloc(bytes, true);  // `true` presumably marks a user-owned request -- TODO confirm in MemoryManager::alloc
+    return user_ptr;
+}
+template<typename T>
+void memFree(T *ptr) {
+    // Gives `ptr` back through the manager's unlock(); the second argument is false here (memFreeUser passes true).
+    getMemoryManager().unlock((void *)ptr, false);
+}
 
-            for(pinned_iter iter = pinned_maps[n].begin();
-                iter != pinned_maps[n].end(); ++iter) {
+void memFreeUser(void *ptr) {
+    // Calls the manager's unlock() with `true` as its second argument.
+    getMemoryManager().unlock(ptr, true);
+}
 
-                mem_info info = iter->second.info;
-                if (info.is_free && info.bytes == alloc_bytes) {
-                    iter->second.info.is_free = false;
-                    pinned_used_bytes += alloc_bytes;
-                    return iter->first;
-                }
-            }
+cl::Buffer *bufferAlloc(const size_t &bytes) {
+    void *raw = getMemoryManager().alloc(bytes, false);  // same manager call as memAlloc, sized in bytes
+    return (cl::Buffer *)raw;
+}
 
-            try {
-                buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes);
+void bufferFree(cl::Buffer *buf) {
+    // Gives the buffer pointer back through the manager's unlock(), with false as the second argument.
+    getMemoryManager().unlock((void *)buf, false);
+}
 
-                ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE,
-                                                  0, alloc_bytes);
-            } catch(...) {
-                pinnedGarbageCollect();
-                buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes);
+void memLock(const void *ptr) {
+    // The cast drops const before the pointer is handed to the manager's userLock().
+    getMemoryManager().userLock((void *)ptr);
+}
 
-                ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE,
-                                                  0, alloc_bytes);
-            }
-            mem_info info = {false, false, alloc_bytes};
-            pinned_info pt = {buf, info};
-            pinned_maps[n][ptr] = pt;
-            pinned_used_bytes += alloc_bytes;
-        }
-        return ptr;
-    }
+void memUnlock(const void *ptr) {
+    // The cast drops const before the pointer is handed to the manager's userUnlock().
+    getMemoryManager().userUnlock((void *)ptr);
+}
 
-    void pinnedBufferFree(void *ptr)
-    {
-        int n = getActiveDeviceId();
-        pinned_iter iter = pinned_maps[n].find(ptr);
+void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                      size_t *lock_bytes,  size_t *lock_buffers)
+{
+    // All four output pointers are passed straight through to the manager's bufferInfo().
+    getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers);
+}
 
-        if (iter != pinned_maps[n].end()) {
-            iter->second.info.is_free = true;
-            pinned_used_bytes -= iter->second.info.bytes;
-        } else {
-            pinnedDestroy(iter->second.buf, ptr); // Free it because we are not sure what the size is
-            pinned_maps[n].erase(iter);
-        }
-    }
+template<typename T>
+T* pinnedAlloc(const size_t &elements) {
+    const size_t nbytes = elements * sizeof(T);  // element count -> byte count
+    return (T *)getMemoryManagerPinned().alloc(nbytes, false);  // served by the pinned manager, not the device one
+}
 
-    template<typename T>
-    T* pinnedAlloc(const size_t &elements)
-    {
-        managerInit();
-        return (T *)pinnedBufferAlloc(elements * sizeof(T));
-    }
+template<typename T>
+void pinnedFree(T* ptr) {
+    // Gives `ptr` back through the pinned manager's unlock(), with false as the second argument.
+    getMemoryManagerPinned().unlock((void *)ptr, false);
+}
 
-    template<typename T>
-    void pinnedFree(T* ptr)
-    {
-        return pinnedBufferFree((void *) ptr);
-    }
+bool checkMemoryLimit() {
+    const bool result = getMemoryManager().checkMemoryLimit();  // answer comes from the device memory manager
+    return result;
+}
 
-#define INSTANTIATE(T)                                  \
-    template T* memAlloc(const size_t &elements);       \
-    template void memFree(T* ptr);                      \
-    template void memPop(const T* ptr);                 \
-    template void memPush(const T* ptr);                \
-    template T* pinnedAlloc(const size_t &elements);    \
-    template void pinnedFree(T* ptr);                   \
+#define INSTANTIATE(T)                                      \
+    template T* memAlloc(const size_t &elements);           \
+    template void memFree(T* ptr);                          \
+    template T* pinnedAlloc(const size_t &elements);        \
+    template void pinnedFree(T* ptr);                       \
 
     INSTANTIATE(float)
     INSTANTIATE(cfloat)
diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp
index c315a9a2f6..a02d387591 100644
--- a/src/backend/opencl/memory.hpp
+++ b/src/backend/opencl/memory.hpp
@@ -17,22 +17,32 @@ namespace opencl
     cl::Buffer *bufferAlloc(const size_t &bytes);
     void bufferFree(cl::Buffer *buf);
 
-    template<typename T> T *memAlloc(const size_t &elements);
-    template<typename T> void memFree(T *ptr);
-    template<typename T> void memPop(const T *ptr);
-    template<typename T> void memPush(const T *ptr);
+    template<typename T> T* memAlloc(const size_t &elements);
+    void *memAllocUser(const size_t &bytes);
+
+    // These must be two separate functions rather than one function with a
+    // default argument, because the free function is used as the deleter of a
+    // shared pointer, which cannot support default arguments.
+    template<typename T> void memFree(T* ptr);
+    void memFreeUser(void* ptr);
+
+    void memLock(const void *ptr);
+    void memUnlock(const void *ptr);
 
     template<typename T> T* pinnedAlloc(const size_t &elements);
     template<typename T> void pinnedFree(T* ptr);
 
-    static const unsigned MAX_BUFFERS   = 100;
-    static const unsigned MAX_BYTES     = (1 << 30);
+    size_t getMaxBytes();
+    unsigned getMaxBuffers();
 
     void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
                           size_t *lock_bytes,  size_t *lock_buffers);
     void garbageCollect();
     void pinnedGarbageCollect();
 
+    void printMemInfo(const char *msg, const int device);
+
     void setMemStepSize(size_t step_bytes);
     size_t getMemStepSize(void);
+    bool checkMemoryLimit();
 }
diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp
index 16fb3e0d34..dc8ab4ea65 100644
--- a/src/backend/opencl/platform.cpp
+++ b/src/backend/opencl/platform.cpp
@@ -29,6 +29,7 @@
 #include <defines.hpp>
 #include <version.hpp>
 #include <platform.hpp>
+#include <util.hpp>
 #include <functional>
 #include <algorithm>
 #include <cctype>
@@ -41,6 +42,8 @@
 #include <map>
 #include <errorcodes.hpp>
 #include <err_opencl.hpp>
+#include <util.hpp>
+#include <host_memory.hpp>
 
 using std::string;
 using std::vector;
@@ -99,7 +102,6 @@ DeviceManager::~DeviceManager()
     for (auto q: mQueues) delete q;
     for (auto d : mDevices) delete d;
     for (auto c : mContexts) delete c;
-    for (auto p : mPlatforms) delete p;
 #endif
 }
 
@@ -109,56 +111,182 @@ void DeviceManager::setContext(int device)
     mActiveCtxId = device;
 }
 
+static inline bool verify_present(const std::string &pname, const char *ref)
+{   // True when `ref` occurs anywhere in `pname` (case-sensitive); taken by const reference to avoid a copy per call.
+    return pname.find(ref) != std::string::npos;
+}
+
+static inline bool compare_default(const Device *ldev, const Device *rdev)
+{   // Sort predicate: true when ldev must be listed before rdev. It must be a strict ordering (false for equal devices).
+    const cl_device_type device_types[] = {CL_DEVICE_TYPE_GPU,
+                                           CL_DEVICE_TYPE_ACCELERATOR};
+
+    auto l_dev_type = ldev->getInfo<CL_DEVICE_TYPE>();
+    auto r_dev_type = rdev->getInfo<CL_DEVICE_TYPE>();
+
+    // This ensures GPU > ACCELERATOR > CPU
+    for (auto current_type : device_types) {
+        auto is_l_curr_type = l_dev_type == current_type;
+        auto is_r_curr_type = r_dev_type == current_type;
+
+        if ( is_l_curr_type && !is_r_curr_type) return true;
+        if (!is_l_curr_type &&  is_r_curr_type) return false;
+    }
+
+    // This ensures discrete > integrated (host-unified memory); the check runs for every device type
+    auto is_l_integrated = ldev->getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>();
+    auto is_r_integrated = rdev->getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>();
+
+    if (!is_l_integrated &&  is_r_integrated) return true;
+    if ( is_l_integrated && !is_r_integrated) return false;
+
+    // At this point, the devices are of same type.
+    // Sort based on empirical evidence of preferred platforms
+
+    // Platform names as returned by getPlatformName()
+    std::string lPlatName = getPlatformName(*ldev);
+    std::string rPlatName = getPlatformName(*rdev);
+
+    if (l_dev_type == CL_DEVICE_TYPE_GPU &&
+        r_dev_type == CL_DEVICE_TYPE_GPU ) {
+        // If GPU, prefer AMD > NVIDIA > APPLE > INTEL > BEIGNET (the array order below)
+        const char *platforms[] = {"AMD", "NVIDIA", "APPLE", "INTEL", "BEIGNET"};
+
+        for (auto ref_name : platforms) {
+            if ( verify_present(lPlatName, ref_name) &&
+                !verify_present(rPlatName, ref_name)) return true;
+
+            if (!verify_present(lPlatName, ref_name) &&
+                 verify_present(rPlatName, ref_name)) return false;
+        }
+
+        // No platform preference separates them: fall through to the version and memory checks
+    } else {
+        // Otherwise (not both GPUs), prefer INTEL > AMD > POCL > APPLE
+        const char *platforms[] = {"INTEL", "AMD", "POCL", "APPLE"};
+
+        for (auto ref_name : platforms) {
+            if ( verify_present(lPlatName, ref_name) &&
+                !verify_present(rPlatName, ref_name)) return true;
+
+            if (!verify_present(lPlatName, ref_name) &&
+                 verify_present(rPlatName, ref_name)) return false;
+        }
+    }
+
+
+    // Compare device compute versions
+
+    {
+        // Check Device OpenCL Version. The string has the form "OpenCL <major>.<minor> ...",
+        // so index 7 is read as the major digit and index 9 as the minor digit.
+        auto lversion =  ldev->getInfo<CL_DEVICE_VERSION>();
+        auto rversion =  rdev->getInfo<CL_DEVICE_VERSION>();
+        if (lversion.size() < 10) lversion.resize(10, '\0');  // pad so that [7] and [9] are always in range
+        if (rversion.size() < 10) rversion.resize(10, '\0');
+        bool lres = (lversion[7] > rversion[7]) ||
+            ((lversion[7] == rversion[7]) && (lversion[9] > rversion[9]));
+        bool rres = (lversion[7] < rversion[7]) ||
+            ((lversion[7] == rversion[7]) && (lversion[9] < rversion[9]));
+        if (lres) return true;
+        if (rres) return false;
+    }
+
+    // Default criterion: the device with more global memory comes first.
+    // Strictly greater, so equal devices compare false both ways as std::stable_sort requires.
+    auto l_mem = ldev->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+    auto r_mem = rdev->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+    return l_mem > r_mem;
+}
+
+static afcl::deviceType getDeviceTypeEnum(cl::Device dev) {
+    const cl_device_type raw_type = dev.getInfo<CL_DEVICE_TYPE>();  // OpenCL's own device-type value
+    return (afcl::deviceType)raw_type;  // cast as-is; no value translation is performed
+}
+
+
+static afcl::platform getPlatformEnum(cl::Device dev)
+{
+    // Maps the device's platform name to an AFCL_PLATFORM_* value. Tags are tried in
+    // table order and the first one found in the name wins; no match yields UNKNOWN.
+    static const struct { const char *tag; afcl::platform id; } known[] = {
+        {"AMD",   AFCL_PLATFORM_AMD},   {"NVIDIA",  AFCL_PLATFORM_NVIDIA},  {"INTEL", AFCL_PLATFORM_INTEL},
+        {"APPLE", AFCL_PLATFORM_APPLE}, {"BEIGNET", AFCL_PLATFORM_BEIGNET}, {"POCL",  AFCL_PLATFORM_POCL}};
+    const std::string name = getPlatformName(dev);
+    for (const auto &entry : known) if (verify_present(name, entry.tag)) return entry.id;
+    return AFCL_PLATFORM_UNKNOWN;
+}
+
+
 DeviceManager::DeviceManager()
-    : mActiveCtxId(0), mActiveQId(0)
+    : mUserDeviceOffset(0), mActiveCtxId(0), mActiveQId(0)
 {
     try {
         std::vector<cl::Platform>   platforms;
         Platform::get(&platforms);
 
-        cl_device_type DEVC_TYPES[] = {
-            CL_DEVICE_TYPE_GPU,
-#ifndef OS_MAC
-            CL_DEVICE_TYPE_ACCELERATOR,
-            CL_DEVICE_TYPE_CPU
+        // This is all we need because the sort takes care of the order of devices
+#ifdef OS_MAC
+        cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_GPU;
+#else
+        cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_ALL;
 #endif
-        };
-
-        for (auto &platform : platforms)
-            mPlatforms.push_back(new Platform(platform));
-
-        unsigned nDevices = 0;
-        for (auto devType : DEVC_TYPES) {
-            for (auto &platform : platforms) {
-
-                cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
-                    (cl_context_properties)(platform()),
-                    0};
-
-                std::vector<Device> devs;
-                try {
-                    platform.getDevices(devType, &devs);
-                } catch(const cl::Error &err) {
-                    if (err.err() != CL_DEVICE_NOT_FOUND) {
-                        throw;
-                    }
-                }
 
-                for (auto dev : devs) {
-                    nDevices++;
-                    Context *ctx = new Context(dev, cps);
-                    CommandQueue *cq = new CommandQueue(*ctx, dev);
-                    mDevices.push_back(new Device(dev));
-                    mContexts.push_back(ctx);
-                    mQueues.push_back(cq);
-                    mCtxOffsets.push_back(nDevices);
-                    mIsGLSharingOn.push_back(false);
+        std::string deviceENV = getEnvVar("AF_OPENCL_DEVICE_TYPE");
+        // Restrict enumeration to one device class when the variable names it; other values keep the default.
+        if (deviceENV.compare("GPU") == 0) {
+            DEVICE_TYPES = CL_DEVICE_TYPE_GPU;
+        } else if (deviceENV.compare("CPU") == 0) {
+            DEVICE_TYPES = CL_DEVICE_TYPE_CPU;
+        } else if (deviceENV.compare(0, 3, "ACC") == 0) {  // prefix match ("ACC", "ACCELERATOR"); `>= 0` matched unrelated text
+            DEVICE_TYPES = CL_DEVICE_TYPE_ACCELERATOR;
+        }
+
+
+
+        // Iterate through platforms, get all available devices and store them
+        for (auto &platform : platforms) {
+            std::vector<Device> current_devices;
+
+            try {
+                platform.getDevices(DEVICE_TYPES, &current_devices);
+            } catch(const cl::Error &err) {
+                if (err.err() != CL_DEVICE_NOT_FOUND) {
+                    throw;
                 }
             }
+
+            for (auto dev : current_devices) {
+                mDevices.push_back(new Device(dev));
+            }
+        }
+
+        int nDevices = mDevices.size();
+
+        if (nDevices == 0) AF_ERROR("No OpenCL devices found", AF_ERR_RUNTIME);
+
+        // Sort OpenCL devices based on default criteria
+        std::stable_sort(mDevices.begin(), mDevices.end(), compare_default);
+
+        // Create contexts and queues once the sort is done
+        for (int i = 0; i < nDevices; i++) {
+            cl_platform_id device_platform = mDevices[i]->getInfo<CL_DEVICE_PLATFORM>();
+            cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
+                                            (cl_context_properties)(device_platform),
+                                            0};
+
+            Context *ctx = new Context(*mDevices[i], cps);
+            CommandQueue *cq = new CommandQueue(*ctx, *mDevices[i]);
+            mContexts.push_back(ctx);
+            mQueues.push_back(cq);
+            mIsGLSharingOn.push_back(false);
+            mDeviceTypes.push_back(getDeviceTypeEnum(*mDevices[i]));
+            mPlatforms.push_back(getPlatformEnum(*mDevices[i]));
         }
 
-        const char* deviceENV = getenv("AF_OPENCL_DEFAULT_DEVICE");
-        if(deviceENV) {
+        bool default_device_set = false;
+        deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE");
+        if(!deviceENV.empty()) {
             std::stringstream s(deviceENV);
             int def_device = -1;
             s >> def_device;
@@ -167,18 +295,48 @@ DeviceManager::DeviceManager()
                 printf("Setting default device as 0\n");
             } else {
                 setContext(def_device);
+                default_device_set = true;
             }
         }
+
+        deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE_TYPE");
+        if (!default_device_set && !deviceENV.empty())
+        {
+            cl_device_type default_device_type = CL_DEVICE_TYPE_GPU;  // used for "GPU" and for any unrecognised value
+            if (deviceENV.compare("CPU") == 0) {
+                default_device_type = CL_DEVICE_TYPE_CPU;
+            } else if (deviceENV.compare(0, 3, "ACC") == 0) {  // prefix match; the old `>= 0` test also matched "GPU"
+                default_device_type = CL_DEVICE_TYPE_ACCELERATOR;
+            }
+            // default_device_set is false here (see the guard above), so it is reused to record
+            // whether a device of the requested type exists; no shadowing local is needed.
+            for (int i = 0; i < nDevices; i++) {
+                if (mDevices[i]->getInfo<CL_DEVICE_TYPE>() == default_device_type) {
+                    default_device_set = true;
+                    setContext(i);
+                    break;
+                }
+            }
+
+            if (!default_device_set) {
+                printf("WARNING: AF_OPENCL_DEFAULT_DEVICE_TYPE=%s is not available\n",
+                       deviceENV.c_str());
+                printf("Using default device as 0\n");
+            }
+        }
+
     } catch (const cl::Error &error) {
             CL_TO_AF_ERROR(error);
     }
-    /* loop over devices and replace contexts with
-     * OpenGL shared contexts whereever applicable */
+
+
 #if defined(WITH_GRAPHICS)
     // Define AF_DISABLE_GRAPHICS with any value to disable initialization
-    const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS");
-    if(!noGraphicsENV) { // If AF_DISABLE_GRAPHICS is not defined
+    std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS");
+    if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined
         try {
+            /* loop over devices and replace contexts with
+             * OpenGL shared contexts whereever applicable */
             int devCount = mDevices.size();
             fg::Window* wHandle = graphics::ForgeManager::getInstance().getMainWindow();
             for(int i=0; i<devCount; ++i)
@@ -187,6 +345,7 @@ DeviceManager::DeviceManager()
         }
     }
 #endif
+    mUserDeviceOffset = mDevices.size();
 }
 
 
@@ -206,11 +365,12 @@ static std::string platformMap(std::string &platStr)
     typedef std::map<std::string, std::string> strmap_t;
     static strmap_t platMap;
     if (isFirst) {
-        platMap["NVIDIA CUDA"] = "NVIDIA  ";
-        platMap["Intel(R) OpenCL"] = "INTEL   ";
+        platMap["NVIDIA CUDA"]                         = "NVIDIA  ";
+        platMap["Intel(R) OpenCL"]                     = "INTEL   ";
         platMap["AMD Accelerated Parallel Processing"] = "AMD     ";
-        platMap["Intel Gen OCL Driver"] = "BEIGNET ";
-        platMap["Apple"] = "APPLE   ";
+        platMap["Intel Gen OCL Driver"]                = "BEIGNET ";
+        platMap["Apple"]                               = "APPLE   ";
+        platMap["Portable Computing Language"]         = "POCL    ";
         isFirst = false;
     }
 
@@ -223,45 +383,48 @@ static std::string platformMap(std::string &platStr)
     }
 }
 
-std::string getInfo()
+std::string getDeviceInfo()
 {
     ostringstream info;
     info << "ArrayFire v" << AF_VERSION
          << " (OpenCL, " << get_system() << ", build " << AF_REVISION << ")" << std::endl;
 
     unsigned nDevices = 0;
-    for (auto context : DeviceManager::getInstance().mContexts) {
-        vector<Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
+    for(auto &device: DeviceManager::getInstance().mDevices) {
+        const Platform platform(device->getInfo<CL_DEVICE_PLATFORM>());
 
-        for(auto &device:devices) {
-            const Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+        string dstr = device->getInfo<CL_DEVICE_NAME>();
 
-            string platStr = platform.getInfo<CL_PLATFORM_NAME>();
-            string dstr = device.getInfo<CL_DEVICE_NAME>();
+        // Remove null termination character from the strings
+        dstr.pop_back();
 
-            // Remove null termination character from the strings
-            platStr.pop_back();
-            dstr.pop_back();
+        bool show_braces = ((unsigned)getActiveDeviceId() == nDevices);
 
-            bool show_braces = ((unsigned)getActiveDeviceId() == nDevices);
-            string id = (show_braces ? string("[") : "-") + std::to_string(nDevices) +
-                        (show_braces ? string("]") : "-");
-            info << id << " " << platformMap(platStr) << ": " << ltrim(dstr) << " ";
+        string id =
+            (show_braces ? string("[") : "-") +
+            std::to_string(nDevices) +
+            (show_braces ? string("]") : "-");
+
+        size_t msize = device->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+        info << id << " " << getPlatformName(*device) << ": " << ltrim(dstr)
+             << ", " << msize / 1048576 << " MB";
 #ifndef NDEBUG
-            string devVersion = device.getInfo<CL_DEVICE_VERSION>();
-            string driVersion = device.getInfo<CL_DRIVER_VERSION>();
-            devVersion.pop_back();
-            driVersion.pop_back();
-            info << devVersion;
-            info << " Device driver " << driVersion;
-            info << " FP64 Support("
-                 << (device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE>()>0 ? "True" : "False")
-                 << ")";
+        info << " -- ";
+        string devVersion = device->getInfo<CL_DEVICE_VERSION>();
+        string driVersion = device->getInfo<CL_DRIVER_VERSION>();
+        devVersion.pop_back();
+        driVersion.pop_back();
+        info << devVersion;
+        info << " -- Device driver " << driVersion;
+        info << " -- FP64 Support: "
+             << (device->getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE>()>0 ? "True" : "False");
+        info << " -- Unified Memory ("
+             << (isHostUnifiedMemory(*device) ? "True" : "False")
+             << ")";
 #endif
-            info << std::endl;
+        info << std::endl;
 
-            nDevices++;
-        }
+        nDevices++;
     }
     return info.str();
 }
@@ -270,6 +433,8 @@ std::string getPlatformName(const cl::Device &device)
 {
     const Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
     std::string platStr = platform.getInfo<CL_PLATFORM_NAME>();
+    // Remove null termination character from the strings
+    platStr.pop_back();
     return platformMap(platStr);
 }
 
@@ -295,6 +460,17 @@ int getDeviceIdFromNativeId(cl_device_id id)
     return devId;
 }
 
+int getActiveDeviceType()
+{   // Reads the type from the active device itself: mDeviceTypes is not guaranteed to hold an entry for every
+    DeviceManager &instance = DeviceManager::getInstance();  // pooled device, so indexing it could run out of bounds.
+    return getDeviceTypeEnum(*instance.mDevices[instance.mActiveQId]);
+}
+
+int getActivePlatform() {  // Platform id stored in mPlatforms for the active device.
+    DeviceManager &manager = DeviceManager::getInstance();
+    const unsigned active = manager.mActiveQId;
+    return manager.mPlatforms[active];
+}
 const Context& getContext()
 {
     DeviceManager& devMngr = DeviceManager::getInstance();
@@ -307,10 +483,54 @@ CommandQueue& getQueue()
     return *(devMngr.mQueues[devMngr.mActiveQId]);
 }
 
-const cl::Device& getDevice()
+const cl::Device& getDevice(int id)  // returns the pooled device at index `id`; -1 selects the active device
 {
     DeviceManager& devMngr = DeviceManager::getInstance();
-    return *(devMngr.mDevices[devMngr.mActiveQId]);
+    if(id == -1) id = devMngr.mActiveQId;
+    return *(devMngr.mDevices[id]);  // NOTE(review): no range check on id -- callers must pass -1 or a valid pool index
+}
+
+size_t getDeviceMemorySize(int device)
+{
+    // CL_DEVICE_GLOBAL_MEM_SIZE of the device at pool index `device`
+    // (getDevice maps -1 to the active device).
+    return getDevice(device).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+}
+
+size_t getHostMemorySize() {
+    const size_t host_mem = common::getHostMemorySize();  // forwarded from the common:: function of the same name
+    return host_mem;
+}
+
+cl_device_type getDeviceType()
+{
+    // CL_DEVICE_TYPE of the active device; bound by reference so the cl::Device wrapper is not copied.
+    const cl::Device &device = getDevice();
+    return device.getInfo<CL_DEVICE_TYPE>();
+}
+
+bool isHostUnifiedMemory(const cl::Device &device) {
+    const cl_bool unified = device.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>();  // value reported by OpenCL for this device
+    return unified;
+}
+
+bool OpenCLCPUOffload(bool forceOffloadOSX)
+{   // True when work should be offloaded to the host CPU for the active device.
+    static const bool offloadEnv = getEnvVar("AF_OPENCL_CPU_OFFLOAD") == "1";  // read once, on the first call
+    bool offload = false;
+    if(offloadEnv) offload = isHostUnifiedMemory(getDevice());
+#ifdef OS_MAC
+    // FORCED OFFLOAD FOR LAPACK FUNCTIONS ON OSX UNIFIED MEMORY DEVICES
+    //
+    // On OSX Unified Memory devices (Intel), always offload LAPACK but not GEMM
+    // irrespective of the AF_OPENCL_CPU_OFFLOAD value
+    // From GEMM, OpenCLCPUOffload(false) is called which will render the
+    // variable inconsequential to the returned result.
+    //
+    // Issue https://github.com/arrayfire/arrayfire/issues/662
+    offload = offload || forceOffloadOSX;
+#endif
+    return offload;
 }
 
 bool isGLSharingSupported()
@@ -478,10 +698,133 @@ void DeviceManager::markDeviceForInterop(const int device, const fg::Window* wHa
 }
 #endif
 
+void addDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que)
+{   // Appends a user-supplied device/context (and queue, or a new queue when `que` is NULL) to the device pool.
+    try {
+        DeviceManager& devMngr   = DeviceManager::getInstance();
+        cl::Device* tDevice      = new cl::Device(dev);
+        cl::Context* tContext    = new cl::Context(ctx);
+        cl::CommandQueue* tQueue = (que==NULL ?
+                new cl::CommandQueue(*tContext, *tDevice) : new cl::CommandQueue(que));
+        devMngr.mDevices.push_back(tDevice);
+        devMngr.mContexts.push_back(tContext);
+        devMngr.mQueues.push_back(tQueue);
+        devMngr.mPlatforms.push_back(getPlatformEnum(*tDevice));
+        devMngr.mDeviceTypes.push_back(getDeviceTypeEnum(*tDevice));  // keep this vector index-aligned with mDevices
+        devMngr.mIsGLSharingOn.push_back(false);  // FIXME: add OpenGL Interop for user provided contexts later
+    } catch (const cl::Error &ex) {
+        CL_TO_AF_ERROR(ex);
+    }
+}
+
+void setDeviceContext(cl_device_id dev, cl_context ctx)
+{   // Activates the first pooled entry whose native device AND context handles both match.
+    // FIXME: add OpenGL Interop for user provided contexts later
+    try {
+        DeviceManager& pool = DeviceManager::getInstance();
+        const int total = pool.mDevices.size();
+        for (int idx = 0; idx < total; ++idx) {
+            const bool sameDevice  = (*pool.mDevices[idx])()  == dev;
+            const bool sameContext = (*pool.mContexts[idx])() == ctx;
+            if (!(sameDevice && sameContext)) continue;
+            setDevice(idx);
+            return;
+        }
+    } catch (const cl::Error &ex) {
+        CL_TO_AF_ERROR(ex);
+    }
+    AF_ERROR("No matching device found", AF_ERR_ARG);  // no pooled entry matched
+}
+
+void removeDeviceContext(cl_device_id dev, cl_context ctx)
+{   // Removes a user-added (device, context) entry from the pool; the active device and internal devices are rejected.
+    try {
+        if (getDevice()() == dev && getContext()()==ctx) {
+            AF_ERROR("Cannot pop the device currently in use", AF_ERR_ARG);
+        }
+
+        DeviceManager& devMngr = DeviceManager::getInstance();
+        const int dCount = devMngr.mDevices.size();
+        int deleteIdx = -1;
+        for (int i = 0; i<dCount; ++i) {
+            if(devMngr.mDevices[i]->operator()()==dev &&
+                    devMngr.mContexts[i]->operator()()==ctx) {
+                deleteIdx = i;
+                break;
+            }
+        }
+        if (deleteIdx == -1) {  // must be tested first: -1 is also below mUserDeviceOffset
+            AF_ERROR("No matching device found", AF_ERR_ARG);
+        } else if (deleteIdx < (int)devMngr.mUserDeviceOffset) {
+            AF_ERROR("Cannot pop ArrayFire internal devices", AF_ERR_ARG);
+        } else {
+            // FIXME: this case can potentially cause issues due to the
+            // modification of the device pool stl containers.
+            //
+            // Erase the entry from every per-device vector so they stay index-aligned.
+            // NOTE(review): the erased pointers are not deleted here -- confirm who owns them.
+            devMngr.mDevices.erase(devMngr.mDevices.begin()+deleteIdx);
+            devMngr.mContexts.erase(devMngr.mContexts.begin()+deleteIdx);
+            devMngr.mQueues.erase(devMngr.mQueues.begin()+deleteIdx);
+            devMngr.mPlatforms.erase(devMngr.mPlatforms.begin()+deleteIdx);
+            if (deleteIdx < (int)devMngr.mDeviceTypes.size())  // guard: mDeviceTypes may have no entry at this index
+                devMngr.mDeviceTypes.erase(devMngr.mDeviceTypes.begin()+deleteIdx);
+            // FIXME: add OpenGL Interop for user provided contexts later
+            devMngr.mIsGLSharingOn.erase(devMngr.mIsGLSharingOn.begin()+deleteIdx);
+            // Entries after deleteIdx moved down by one: keep the active ids on the same device.
+            if (deleteIdx < (int)devMngr.mActiveCtxId) {
+                --devMngr.mActiveCtxId;
+                --devMngr.mActiveQId;
+            }
+        }
+    } catch (const cl::Error &ex) {
+        CL_TO_AF_ERROR(ex);
+    }
+}
+
+bool synchronize_calls() {  // True only when AF_SYNCHRONOUS_CALLS is exactly "1".
+    static bool forced_sync = (getEnvVar("AF_SYNCHRONOUS_CALLS") == "1");  // evaluated once, on the first call
+    return forced_sync;
+}
+
+
+unsigned getMaxJitSize()
+{   // AF_OPENCL_MAX_JIT_LEN when it parses as a non-negative int, otherwise MAX_JIT_LEN; capped on the AMD platform.
+    const int MAX_JIT_LEN = 20;
+    const int MAX_JIT_LEN_AMD = 16; //FIXME: Change this when bug is fixed
+
+    static int length = 0;  // 0 means "not resolved yet"
+    if (length == 0) {
+        length = MAX_JIT_LEN;
+        std::string env_var = getEnvVar("AF_OPENCL_MAX_JIT_LEN");
+        if (!env_var.empty()) {
+            try { const int parsed = std::stoi(env_var); if (parsed >= 0) length = parsed; }  // negatives would wrap on return
+            catch (const std::exception &) { /* std::stoi rejected the text: keep MAX_JIT_LEN */ }
+        }
+    }
+
+    if (getActivePlatform() == AFCL_PLATFORM_AMD) {
+        return std::min(length, MAX_JIT_LEN_AMD);
+    }
+    return length;
+}
+
 }
 
 using namespace opencl;
 
+af_err afcl_get_device_type(afcl_device_type *res) {  // Writes the active device's type to *res.
+    const int active_type = getActiveDeviceType();
+    *res = (afcl_device_type)active_type;
+    return AF_SUCCESS;
+}
+
+af_err afcl_get_platform(afcl_platform *res) {  // Writes the active device's platform id to *res.
+    const int active_platform = getActivePlatform();
+    *res = (afcl_platform)active_platform;
+    return AF_SUCCESS;
+}
+
 af_err afcl_get_context(cl_context *ctx, const bool retain)
 {
     *ctx = getContext()();
@@ -508,3 +851,21 @@ af_err afcl_set_device_id(cl_device_id id)
     setDevice(getDeviceIdFromNativeId(id));
     return AF_SUCCESS;
 }
+
+af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que) {
+    // C entry point: forwards all three handles to opencl::addDeviceContext.
+    addDeviceContext(dev, ctx, que);
+    return AF_SUCCESS;
+}
+
+af_err afcl_set_device_context(cl_device_id dev, cl_context ctx) {
+    // C entry point: forwards both handles to opencl::setDeviceContext.
+    setDeviceContext(dev, ctx);
+    return AF_SUCCESS;
+}
+
+af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx) {
+    // C entry point: forwards both handles to opencl::removeDeviceContext.
+    removeDeviceContext(dev, ctx);
+    return AF_SUCCESS;
+}
diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp
index 90f57aed39..42579f89d1 100644
--- a/src/backend/opencl/platform.hpp
+++ b/src/backend/opencl/platform.hpp
@@ -21,7 +21,7 @@ namespace opencl
 
 class DeviceManager
 {
-    friend std::string getInfo();
+    friend std::string getDeviceInfo();
 
     friend int getDeviceCount();
 
@@ -33,7 +33,9 @@ class DeviceManager
 
     friend cl::CommandQueue& getQueue();
 
-    friend const cl::Device& getDevice();
+    friend const cl::Device& getDevice(int id);
+
+    friend size_t getDeviceMemorySize(int device);
 
     friend bool isGLSharingSupported();
 
@@ -43,8 +45,17 @@ class DeviceManager
 
     friend int setDevice(int device);
 
+    friend void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que);
+
+    friend void setDeviceContext(cl_device_id dev, cl_context cxt);
+
+    friend void removeDeviceContext(cl_device_id dev, cl_context ctx);
+
+    friend int getActiveDeviceType();
+    friend int getActivePlatform();
+
     public:
-        static const unsigned MAX_DEVICES = 16;
+        static const unsigned MAX_DEVICES = 32;
 
         static DeviceManager& getInstance();
 
@@ -67,12 +78,13 @@ class DeviceManager
 
     private:
         // Attributes
-        std::vector<cl::CommandQueue*>  mQueues;
         std::vector<cl::Device*>       mDevices;
         std::vector<cl::Context*>     mContexts;
-        std::vector<cl::Platform*>   mPlatforms;
-        std::vector<unsigned>       mCtxOffsets;
+        std::vector<cl::CommandQueue*>  mQueues;
         std::vector<bool>        mIsGLSharingOn;
+        std::vector<int>         mDeviceTypes;
+        std::vector<int>         mPlatforms;
+        unsigned mUserDeviceOffset;
 
         unsigned mActiveCtxId;
         unsigned mActiveQId;
@@ -80,17 +92,29 @@ class DeviceManager
 
 int getBackend();
 
-std::string getInfo();
+std::string getDeviceInfo();
 
 int getDeviceCount();
 
 int getActiveDeviceId();
 
+unsigned getMaxJitSize();
+
 const cl::Context& getContext();
 
 cl::CommandQueue& getQueue();
 
-const cl::Device& getDevice();
+const cl::Device& getDevice(int id = -1);
+
+size_t getDeviceMemorySize(int device);
+
+size_t getHostMemorySize();
+
+cl_device_type getDeviceType();
+
+bool isHostUnifiedMemory(const cl::Device &device);
+
+bool OpenCLCPUOffload(bool forceOffloadOSX = true);
 
 bool isGLSharingSupported();
 
@@ -102,6 +126,17 @@ std::string getPlatformName(const cl::Device &device);
 
 int setDevice(int device);
 
+void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que);
+
+void setDeviceContext(cl_device_id dev, cl_context cxt);
+
+void removeDeviceContext(cl_device_id dev, cl_context ctx);
+
 void sync(int device);
 
+bool synchronize_calls();
+
+int getActiveDeviceType();
+int getActivePlatform();
+
 }
diff --git a/src/backend/opencl/program.hpp b/src/backend/opencl/program.hpp
index 1b76a75ce8..6a2af45131 100644
--- a/src/backend/opencl/program.hpp
+++ b/src/backend/opencl/program.hpp
@@ -9,6 +9,7 @@
 
 #pragma once
 #include <platform.hpp>
+#include <util.hpp>
 #include <string>
 #include <mutex>
 
@@ -35,8 +36,8 @@ using std::string;
 #if defined(NDEBUG)
 
 #define SHOW_BUILD_INFO(PROG) do {                                  \
-        const char *info = getenv("AF_OPENCL_SHOW_BUILD_INFO");     \
-        if (info != nullptr && std::strncmp(info,"0", 1) != 0) {    \
+        std::string info = getEnvVar("AF_OPENCL_SHOW_BUILD_INFO");  \
+        if (!info.empty() && info != "0") {                         \
             SHOW_DEBUG_BUILD_INFO(prog);                            \
         }                                                           \
     } while(0)
diff --git a/src/backend/opencl/qr.cpp b/src/backend/opencl/qr.cpp
index 9e30b43435..56101a8b97 100644
--- a/src/backend/opencl/qr.cpp
+++ b/src/backend/opencl/qr.cpp
@@ -9,16 +9,19 @@
 
 #include <qr.hpp>
 #include <err_common.hpp>
+#include <err_opencl.hpp>
 #include <blas.hpp>
 #include <copy.hpp>
-#include <identity.hpp>
-#include <err_opencl.hpp>
+
+#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+
 #include <magma/magma.h>
 #include <magma/magma_helper.h>
 #include <magma/magma_data.h>
 #include <kernel/triangle.hpp>
-
-#if defined(WITH_OPENCL_LINEAR_ALGEBRA)
+#include <platform.hpp>
+#include <identity.hpp>
+#include <cpu/cpu_qr.hpp>
 
 namespace opencl
 {
@@ -27,6 +30,10 @@ template<typename T>
 void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &orig)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::qr(q, r, t, orig);
+        }
+
         initBlas();
         dim4 iDims = orig.dims();
         int M = iDims[0];
@@ -81,6 +88,10 @@ template<typename T>
 Array<T> qr_inplace(Array<T> &in)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::qr_inplace(in);
+        }
+
         initBlas();
         dim4 iDims = in.dims();
         int M = iDims[0];
diff --git a/src/backend/opencl/set.cpp b/src/backend/opencl/set.cpp
index 5604ff4ad9..c37b7c4c4e 100644
--- a/src/backend/opencl/set.cpp
+++ b/src/backend/opencl/set.cpp
@@ -53,7 +53,7 @@ namespace opencl
             compute::buffer out_data((*out.get())());
 
             compute::buffer_iterator< type_t<T> > begin(out_data, 0);
-            compute::buffer_iterator< type_t<T> > end(out_data, out.dims()[0]);
+            compute::buffer_iterator< type_t<T> > end(out_data, out.elements());
 
             if (!is_sorted) {
                 compute::sort(begin, end, queue);
@@ -83,7 +83,7 @@ namespace opencl
                 unique_second = setUnique(second, false);
             }
 
-            size_t out_size = unique_first.dims()[0] + unique_second.dims()[0];
+            size_t out_size = unique_first.elements() + unique_second.elements();
             Array<T> out = createEmptyArray<T>(dim4(out_size, 1, 1, 1));
 
             compute::command_queue queue(getQueue()());
@@ -93,9 +93,9 @@ namespace opencl
             compute::buffer out_data((*out.get())());
 
             compute::buffer_iterator< type_t<T> > first_begin(first_data, 0);
-            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.dims()[0]);
+            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.elements());
             compute::buffer_iterator< type_t<T> > second_begin(second_data, 0);
-            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.dims()[0]);
+            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.elements());
             compute::buffer_iterator< type_t<T> > out_begin(out_data, 0);
 
             compute::buffer_iterator< type_t<T> > out_end = compute::set_union(
@@ -124,7 +124,7 @@ namespace opencl
                 unique_second = setUnique(second, false);
             }
 
-            size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]);
+            size_t out_size = std::max(unique_first.elements(), unique_second.elements());
             Array<T> out = createEmptyArray<T>(dim4(out_size, 1, 1, 1));
 
             compute::command_queue queue(getQueue()());
@@ -134,9 +134,9 @@ namespace opencl
             compute::buffer out_data((*out.get())());
 
             compute::buffer_iterator< type_t<T> > first_begin(first_data, 0);
-            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.dims()[0]);
+            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.elements());
             compute::buffer_iterator< type_t<T> > second_begin(second_data, 0);
-            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.dims()[0]);
+            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.elements());
             compute::buffer_iterator< type_t<T> > out_begin(out_data, 0);
 
             compute::buffer_iterator< type_t<T> > out_end = compute::set_intersection(
diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp
index 5bd940d127..632647ca19 100644
--- a/src/backend/opencl/sift.cpp
+++ b/src/backend/opencl/sift.cpp
@@ -15,7 +15,7 @@
 #include <err_opencl.hpp>
 #include <handle.hpp>
 
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
 #include <kernel/sift_nonfree.hpp>
 #endif
 
@@ -34,7 +34,7 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
               const float img_scale, const float feature_ratio,
               const bool compute_GLOH)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     unsigned nfeat_out;
     unsigned desc_len;
 
diff --git a/src/backend/opencl/solve.cpp b/src/backend/opencl/solve.cpp
index 6d2bea4b4e..93176752b5 100644
--- a/src/backend/opencl/solve.cpp
+++ b/src/backend/opencl/solve.cpp
@@ -21,10 +21,14 @@
 #include <blas.hpp>
 #include <transpose.hpp>
 #include <math.hpp>
+#include <af/opencl.h>
 
 #include <algorithm>
 #include <string>
 
+#include <platform.hpp>
+#include <cpu/cpu_solve.hpp>
+
 namespace opencl
 {
 
@@ -32,6 +36,10 @@ template<typename T>
 Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                  const Array<T> &b, const af_mat_prop options)
 {
+    if(OpenCLCPUOffload()) {
+        return cpu::solveLU(A, pivot, b, options);
+    }
+
     int N = A.dims()[0];
     int NRHS = b.dims()[1];
 
@@ -219,9 +227,7 @@ Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
                               (*dT)(), tmp.getOffset() + NB * MN,
                               NB, 0, queue);
 
-
-        std::string pName = getPlatformName(getDevice());
-        if(pName.find("NVIDIA") != std::string::npos)
+        if(getActivePlatform() == AFCL_PLATFORM_NVIDIA)
         {
             Array<T> AT = transpose<T>(A, true);
             cl::Buffer* AT_buf = AT.get();
@@ -261,8 +267,7 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
     cl_event event = 0;
     cl_command_queue queue = getQueue()();
 
-    std::string pName = getPlatformName(getDevice());
-    if(pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER))
+    if(getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER))
     {
         Array<T> AT = transpose<T>(A, true);
 
@@ -296,6 +301,10 @@ template<typename T>
 Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
 {
     try {
+        if(OpenCLCPUOffload()) {
+            return cpu::solve(a, b, options);
+        }
+
         initBlas();
 
         if (options & AF_MAT_UPPER ||
diff --git a/src/backend/opencl/svd.cpp b/src/backend/opencl/svd.cpp
index 77f7c8aa37..61da27bdcd 100644
--- a/src/backend/opencl/svd.cpp
+++ b/src/backend/opencl/svd.cpp
@@ -20,6 +20,8 @@
 #include <magma/magma.h>
 #include <magma/magma_cpu_lapack.h>
 #include <magma/magma_helper.h>
+#include <platform.hpp>
+#include <cpu/cpu_svd.hpp>
 
 namespace opencl
 {
@@ -196,6 +198,10 @@ void svd(Array<T > &arrU,
 template<typename T, typename Tr>
 void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
 {
+    if(OpenCLCPUOffload()) {
+        return cpu::svdInPlace(s, u, vt, in);
+    }
+
     initBlas();
     svd<T, Tr>(u, s, vt, in, true);
 }
@@ -203,6 +209,10 @@ void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
 template<typename T, typename Tr>
 void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
 {
+    if(OpenCLCPUOffload()) {
+        return cpu::svd(s, u, vt, in);
+    }
+
     dim4 iDims = in.dims();
     int M = iDims[0];
     int N = iDims[1];
diff --git a/src/backend/opencl/transform.cpp b/src/backend/opencl/transform.cpp
index c8e2b69a8b..379fd2a5b7 100644
--- a/src/backend/opencl/transform.cpp
+++ b/src/backend/opencl/transform.cpp
@@ -18,46 +18,86 @@ namespace opencl
 {
     template<typename T>
     Array<T> transform(const Array<T> &in, const Array<float> &transform,
-                       const af::dim4 &odims,
-                       const af_interp_type method, const bool inverse)
+                       const af::dim4 &odims, const af_interp_type method,
+                       const bool inverse, const bool perspective)
     {
         Array<T> out = createEmptyArray<T>(odims);
 
         if(inverse) {
-            switch(method) {
-                case AF_INTERP_NEAREST:
-                    kernel::transform<T, true, AF_INTERP_NEAREST>
-                                     (out, in, transform);
-                    break;
-                case AF_INTERP_BILINEAR:
-                    kernel::transform<T, true, AF_INTERP_BILINEAR>
-                                     (out, in, transform);
-                    break;
-                case AF_INTERP_LOWER:
-                    kernel::transform<T, true, AF_INTERP_LOWER>
-                                     (out, in, transform);
-                    break;
-                default:
-                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                    break;
+            if (perspective) {
+                switch(method) {
+                    case AF_INTERP_NEAREST:
+                        kernel::transform<T, true, true, AF_INTERP_NEAREST>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_BILINEAR:
+                        kernel::transform<T, true, true, AF_INTERP_BILINEAR>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_LOWER:
+                        kernel::transform<T, true, true, AF_INTERP_LOWER>
+                                         (out, in, transform);
+                        break;
+                    default:
+                        AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+                        break;
+                }
+            } else {
+                switch(method) {
+                    case AF_INTERP_NEAREST:
+                        kernel::transform<T, true, false, AF_INTERP_NEAREST>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_BILINEAR:
+                        kernel::transform<T, true, false, AF_INTERP_BILINEAR>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_LOWER:
+                        kernel::transform<T, true, false, AF_INTERP_LOWER>
+                                         (out, in, transform);
+                        break;
+                    default:
+                        AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+                        break;
+                }
             }
         } else {
-            switch(method) {
-                case AF_INTERP_NEAREST:
-                    kernel::transform<T, false, AF_INTERP_NEAREST>
-                                     (out, in, transform);
-                    break;
-                case AF_INTERP_BILINEAR:
-                    kernel::transform<T, false, AF_INTERP_BILINEAR>
-                                     (out, in, transform);
-                    break;
-                case AF_INTERP_LOWER:
-                    kernel::transform<T, false, AF_INTERP_LOWER>
-                                     (out, in, transform);
-                    break;
-                default:
-                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                    break;
+            if (perspective) {
+                switch(method) {
+                    case AF_INTERP_NEAREST:
+                        kernel::transform<T, false, true, AF_INTERP_NEAREST>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_BILINEAR:
+                        kernel::transform<T, false, true, AF_INTERP_BILINEAR>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_LOWER:
+                        kernel::transform<T, false, true, AF_INTERP_LOWER>
+                                         (out, in, transform);
+                        break;
+                    default:
+                        AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+                        break;
+                }
+            } else {
+                switch(method) {
+                    case AF_INTERP_NEAREST:
+                        kernel::transform<T, false, false, AF_INTERP_NEAREST>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_BILINEAR:
+                        kernel::transform<T, false, false, AF_INTERP_BILINEAR>
+                                         (out, in, transform);
+                        break;
+                    case AF_INTERP_LOWER:
+                        kernel::transform<T, false, false, AF_INTERP_LOWER>
+                                         (out, in, transform);
+                        break;
+                    default:
+                        AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+                        break;
+                }
             }
         }
 
@@ -68,7 +108,7 @@ namespace opencl
 #define INSTANTIATE(T)                                                                  \
     template Array<T> transform(const Array<T> &in, const Array<float> &transform,      \
                                 const af::dim4 &odims, const af_interp_type method,     \
-                                const bool inverse);
+                                const bool inverse, const bool perspective);
 
     INSTANTIATE(float)
     INSTANTIATE(double)
diff --git a/src/backend/opencl/transform.hpp b/src/backend/opencl/transform.hpp
index f0b4d4c955..064817a537 100644
--- a/src/backend/opencl/transform.hpp
+++ b/src/backend/opencl/transform.hpp
@@ -14,5 +14,5 @@ namespace opencl
 {
     template<typename T>
     Array<T> transform(const Array<T> &in, const Array<float> &tf, const af::dim4 &odims,
-                        const af_interp_type method, const bool inverse);
+                        const af_interp_type method, const bool inverse, const bool perspective);
 }
diff --git a/src/backend/opencl/unary.hpp b/src/backend/opencl/unary.hpp
index 5a2cc9e33f..1e363d7dcb 100644
--- a/src/backend/opencl/unary.hpp
+++ b/src/backend/opencl/unary.hpp
@@ -16,7 +16,7 @@ namespace opencl
 {
 
 template<af_op_t op>
-static const char *unaryName() { return "noop"; }
+static const char *unaryName() { return "__noop"; }
 
 #define UNARY_DECL(OP, FNAME)                   \
     template<> STATIC_                          \
diff --git a/src/backend/util.cpp b/src/backend/util.cpp
new file mode 100644
index 0000000000..7c4cd2e614
--- /dev/null
+++ b/src/backend/util.cpp
@@ -0,0 +1,37 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/// This file contains platform independent utility functions
+#include <string>
+#include <cstdlib>
+
+#if defined(OS_WIN)
+#include <Windows.h>
+#endif
+
+using std::string;
+
+string getEnvVar(const std::string &key) // value of environment variable `key`, or "" when the lookup yields nothing
+{
+#if defined(OS_WIN)
+    DWORD bufSize = 32767; // limit according to the GetEnvironmentVariable documentation
+    string retVal;
+    retVal.resize(bufSize);
+    bufSize = GetEnvironmentVariableA(key.c_str(), &retVal[0], bufSize); // explicit ANSI variant: the buffers are char-based even when UNICODE is defined
+    if (!bufSize) {
+        return string(""); // a zero result is reported to the caller as an empty string
+    } else {
+        retVal.resize(bufSize); // shrink the scratch buffer to the length the API reported
+        return retVal;
+    }
+#else
+    char * str = getenv(key.c_str());
+    return str==NULL ? string("") : string(str); // getenv yields NULL for unset variables
+#endif
+}
diff --git a/src/backend/util.hpp b/src/backend/util.hpp
new file mode 100644
index 0000000000..e1cd85a69c
--- /dev/null
+++ b/src/backend/util.hpp
@@ -0,0 +1,16 @@
+/*******************************************************
+ * Copyright (c) 2016, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/// This file contains platform independent utility functions
+
+#pragma once
+
+#include <string>
+
+std::string getEnvVar(const std::string &key); // environment-variable lookup shared by the backends; defined in util.cpp
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 44192eda3a..5db23714d3 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -6,6 +6,8 @@ SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 FIND_PACKAGE(CUDA QUIET)
 FIND_PACKAGE(OpenCL QUIET)
 
+OPTION(BUILD_SINGLE_TEST_FILE "Build tests in a single file" OFF)
+
 # If the tests are not being built at the same time as ArrayFire,
 # we need to first find the ArrayFire library
 IF(TARGET afcpu OR TARGET afcuda OR TARGET afopencl OR TARGET af)
@@ -18,10 +20,28 @@ ELSE()
     FIND_PACKAGE(ArrayFire REQUIRED)
     INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS})
     OPTION(BUILD_NONFREE "Build Tests for nonfree algorithms" OFF)
-    IF(${BUILD_NONFREE}) # Add definition. Not required when building with AF
-        ADD_DEFINITIONS(-DAF_BUILD_SIFT)
+
+    IF(${BUILD_NONFREE})
+        MESSAGE(WARNING "Building With NONFREE ON requires the following patents")
+        SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT")
+    ELSE(${BUILD_NONFREE})
+        UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE
     ENDIF(${BUILD_NONFREE})
 
+    IF(${BUILD_NONFREE_SIFT}) # SIFT is patent-encumbered: define the macro and print the patent notice
+      ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT)
+
+      MESSAGE(WARNING "Building with SIFT requires the following patents")
+
+      MESSAGE("\"Method and apparatus for identifying scale invariant features "
+        "in an image and use of same for locating an object in an image,\" David "
+        "G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application "
+        "filed March 8, 1999. Assignee: The University of British Columbia. For "
+        "further details, contact David Lowe (lowe@cs.ubc.ca) or the "
+        "University-Industry Liaison Office of the University of British "
+        "Columbia.") # MESSAGE joins its arguments with no separator, hence the trailing spaces above
+    ENDIF(${BUILD_NONFREE_SIFT})
+
     # ENABLE_TESTING is required when building only tests
     # When building from source, enable_testing is picked from from the main
     # CMakeLists.txt
@@ -40,14 +60,36 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS)
       SET(TEST_FILES ${FILES})
     ENDIF(${BACKEND} STREQUAL "unified")
 
-    FOREACH(FILE ${TEST_FILES})
+    IF (${BUILD_SINGLE_TEST_FILE})
+      SET(TEST_NAME test_${BACKEND})
+      SET(TEST_NAME_BASIC test_basic_${BACKEND})
+      ADD_EXECUTABLE(${TEST_NAME} ${CPP_FILES})
+      ADD_EXECUTABLE(${TEST_NAME_BASIC} basic_c.c)
+
+      TARGET_LINK_LIBRARIES(${TEST_NAME}  PRIVATE ${AFLIBNAME}
+        PRIVATE ${THREAD_LIB_FLAG}
+        PRIVATE ${GTEST_LIBS}
+        PRIVATE ${OTHER_LIBS})
+
+      TARGET_LINK_LIBRARIES(${TEST_NAME_BASIC}  PRIVATE ${AFLIBNAME}
+        PRIVATE ${THREAD_LIB_FLAG}
+        PRIVATE ${GTEST_LIBS}
+        PRIVATE ${OTHER_LIBS})
+
+      SET_TARGET_PROPERTIES(${TEST_NAME_BASIC}
+        PROPERTIES
+        COMPILE_FLAGS -DAF_${DEF_NAME}
+        FOLDER "Tests/${BACKEND}")
+
+    ELSE()
+      FOREACH(FILE ${TEST_FILES})
         GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE)
         SET(TEST_NAME ${FNAME}_${BACKEND})
 
         IF(NOT ${BUILD_NONFREE} AND "${FILE}" MATCHES ".nonfree.")
-            MESSAGE(STATUS "Removing ${FILE} from ctest")
+          MESSAGE(STATUS "Removing ${FILE} from ctest")
         ELSEIF("${FILE}" MATCHES ".manual.")
-            MESSAGE(STATUS "Removing ${FILE} from ctest")
+          MESSAGE(STATUS "Removing ${FILE} from ctest")
         ELSE()
           ADD_TEST(Test_${TEST_NAME} ${TEST_NAME})
         ENDIF()
@@ -55,18 +97,27 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS)
         FILE(GLOB TEST_FILE "${FNAME}.cpp" "${FNAME}.c")
         ADD_EXECUTABLE(${TEST_NAME} ${TEST_FILE})
         TARGET_LINK_LIBRARIES(${TEST_NAME}  PRIVATE ${AFLIBNAME}
-                                            PRIVATE ${THREAD_LIB_FLAG}
-                                            PRIVATE ${GTEST_LIBS}
-                                            PRIVATE ${OTHER_LIBS})
+          PRIVATE ${THREAD_LIB_FLAG}
+          PRIVATE ${GTEST_LIBS}
+          PRIVATE ${OTHER_LIBS})
 
         SET_TARGET_PROPERTIES(${TEST_NAME}
-                      PROPERTIES
-                      COMPILE_FLAGS -DAF_${DEF_NAME}
-                      FOLDER "Tests/${BACKEND}")
-    ENDFOREACH()
+          PROPERTIES
+          COMPILE_FLAGS -DAF_${DEF_NAME}
+          FOLDER "Tests/${BACKEND}")
+      ENDFOREACH()
+    ENDIF()
 
 ENDMACRO(CREATE_TESTS)
 
+MACRO(CHECK_AND_CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) # CREATE_TESTS wrapper gated on BUILD_<BACKEND>
+    STRING(TOUPPER ${BACKEND} BACKEND_NAME_UPPER) # upper-cased name is used to form the BUILD_<NAME> variable
+    MESSAGE(STATUS "TESTS: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.")
+    IF(${BUILD_${BACKEND_NAME_UPPER}}) # create the tests only when BUILD_<NAME> evaluates to true
+        CREATE_TESTS(${BACKEND} ${AFLIBNAME} "${GTEST_LIBS}" "${OTHER_LIBS}")
+    ENDIF()
+ENDMACRO(CHECK_AND_CREATE_TESTS)
+
 FIND_PACKAGE(Threads REQUIRED)
 IF(CMAKE_USE_PTHREADS_INIT AND NOT "${APPLE}")
     SET(THREAD_LIB_FLAG "-pthread")
@@ -118,19 +169,19 @@ INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 FILE(GLOB FILES "*.cpp" "*.c")
+FILE(GLOB CPP_FILES "*.cpp")
 LIST(SORT FILES)  # Tests execute in alphabetical order
 
-# We only build info.cpp and backend.cpp for Unified backend
-SET(UNIFIED_FILES "backend.cpp;info.cpp")
+# We only build backend.cpp and main.cpp for the Unified backend
+SET(UNIFIED_FILES "backend.cpp;main.cpp")
 LIST(SORT UNIFIED_FILES)  # Tests execute in alphabetical order
 
 # Next we build each example using every backend.
 IF(${ArrayFire_CPU_FOUND})  # variable defined by FIND(ArrayFire ...)
-    MESSAGE(STATUS "TESTS: CPU backend is ON.")
-    CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "")
+    OPTION(BUILD_CPU "Build ArrayFire Tests for CPU backend" ON)
+    CHECK_AND_CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "")
 ELSEIF(TARGET afcpu)        # variable defined by the ArrayFire build tree
-    MESSAGE(STATUS "TESTS: CPU backend is ON.")
-    CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "")
+    CHECK_AND_CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "")
 ELSE()
     MESSAGE(STATUS "TESTS: CPU backend is OFF. afcpu was not found.")
 ENDIF()
@@ -144,10 +195,11 @@ IF (${CUDA_FOUND})
           PATHS ${CUDA_TOOLKIT_ROOT_DIR}
           DOC "CUDA NVVM Library"
           )
-        MESSAGE(STATUS "TESTS: CUDA backend is ON.")
+        MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY)
         # If OSX && CLANG && CUDA < 7
         IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-            CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
+            OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON)
+            CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
 
             FOREACH(FILE ${FILES})
                 GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE)
@@ -158,15 +210,15 @@ IF (${CUDA_FOUND})
 
         # ELSE OSX && CLANG && CUDA < 7
         ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-            CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
+            OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON)
+            CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
 
         ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
 
     ELSEIF(TARGET afcuda)        # variable defined by the ArrayFire build tree
-        MESSAGE(STATUS "TESTS: CUDA backend is ON.")
         # If OSX && CLANG && CUDA < 7
         IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-            CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
+            CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
 
             FOREACH(FILE ${FILES})
                 GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE)
@@ -177,7 +229,7 @@ IF (${CUDA_FOUND})
 
         # ELSE OSX && CLANG && CUDA < 7
         ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-            CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
+            CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}")
 
         ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
     ELSE()
@@ -189,12 +241,13 @@ ENDIF()
 
 # OpenCL Backend
 IF (${OpenCL_FOUND})
+    INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIRS})
     IF(${ArrayFire_OpenCL_FOUND})  # variable defined by FIND(ArrayFire ...)
-        MESSAGE(STATUS "TESTS: OpenCL backend is ON.")
-        CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
+        OPTION(BUILD_OPENCL "Build ArrayFire Tests for OpenCL backend" ON) # lets users switch the OpenCL tests off
+        MESSAGE(STATUS "TESTS: OpenCL libraries: ${OpenCL_LIBRARIES}")
+        CHECK_AND_CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
     ELSEIF(TARGET afopencl)        # variable defined by the ArrayFire build tree
-        MESSAGE(STATUS "TESTS: OpenCL backend is ON.")
-        CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
+        CHECK_AND_CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
     ELSE()
         MESSAGE(STATUS "TESTS: OpenCL backend is OFF. afopencl was not found")
     ENDIF()
@@ -204,11 +257,10 @@ ENDIF()
 
 # Unified Backend
 IF(${ArrayFire_Unified_FOUND})  # variable defined by FIND(ArrayFire ...)
-    MESSAGE(STATUS "TESTS: UNIFIED backend is ON.")
-    CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}")
+  OPTION(BUILD_UNIFIED "Build ArrayFire Tests for Unified backend" ON)
+    CHECK_AND_CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}")
 ELSEIF(TARGET af)        # variable defined by the ArrayFire build tree
-    MESSAGE(STATUS "TESTS: UNIFIED backend is ON.")
-    CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}")
+    CHECK_AND_CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}")
 ELSE()
     MESSAGE(STATUS "TESTS: UNIFIED backend is OFF. af was not found.")
 ENDIF()
diff --git a/test/approx1.cpp b/test/approx1.cpp
index 7a6b66fce8..e7ea94e51e 100644
--- a/test/approx1.cpp
+++ b/test/approx1.cpp
@@ -23,6 +23,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/approx2.cpp b/test/approx2.cpp
index f1a1accc51..75a650631b 100644
--- a/test/approx2.cpp
+++ b/test/approx2.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/array.cpp b/test/array.cpp
index 6c1f511410..293b888a8f 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -454,3 +454,34 @@ TEST(Device, unequal)
         ASSERT_EQ(ptr, b.device<float>());
     }
 }
+
+TEST(DeviceId, Same) // an array created on the current device must report that device's id
+{
+    array a = randu(5,5);
+    ASSERT_EQ(getDevice(), getDeviceId(a));
+}
+
+TEST(DeviceId, Different) // arrays keep the id of the device they were created on; mixing devices must be rejected
+{
+    int ndevices = getDeviceCount();
+    if (ndevices < 2) return; // nothing to check with fewer than two devices
+
+    int id0 = getDevice();
+    int id1 = (id0 + 1) % ndevices;
+
+    array a = randu(5,5);
+    ASSERT_EQ(getDeviceId(a), id0);
+    setDevice(id1);
+
+    array b = randu(5,5);
+
+    ASSERT_EQ(getDeviceId(a), id0);
+    ASSERT_EQ(getDeviceId(b), id1);
+    ASSERT_NE(getDevice(), getDeviceId(a));
+    ASSERT_EQ(getDevice(), getDeviceId(b));
+
+    af_array c = 0; // give the handle a defined value even if af_matmul never writes to it
+    af_err err = af_matmul(&c, a.get(), b.get(), AF_MAT_NONE, AF_MAT_NONE);
+    setDevice(id0); // restore the starting device before the final assertion so a failure there cannot leave id1 active
+    ASSERT_EQ(err, AF_ERR_DEVICE);
+}
diff --git a/test/backend.cpp b/test/backend.cpp
index 59b8fd5129..78b64309db 100644
--- a/test/backend.cpp
+++ b/test/backend.cpp
@@ -21,14 +21,35 @@
 using std::string;
 using std::vector;
 
+const char *getActiveBackendString(af_backend active)
+{
+    // Printable name of a backend enum; any other value maps to the default.
+    if (active == AF_BACKEND_CPU)    return "AF_BACKEND_CPU";
+    if (active == AF_BACKEND_CUDA)   return "AF_BACKEND_CUDA";
+    if (active == AF_BACKEND_OPENCL) return "AF_BACKEND_OPENCL";
+
+    return "AF_BACKEND_DEFAULT";
+}
+
 template<typename T>
 void testFunction()
 {
     af_info();
 
+    // Backend that the library reports as currently active.
+    af_backend activeBackend = (af_backend)0;
+    ASSERT_EQ(AF_SUCCESS, af_get_active_backend(&activeBackend));
+    printf("Active Backend Enum = %s\n", getActiveBackendString(activeBackend));
+
     af_array outArray = 0;
     dim_t dims[] = {32, 32};
     ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, 2, dims, (af_dtype) af::dtype_traits<T>::af_type));
+
+    // Verify backends returned by array and by function are the same.
+    // Non-fatal checks: a failure here must not skip the cleanup below.
+    af_backend arrayBackend = (af_backend)0;
+    EXPECT_EQ(AF_SUCCESS, af_get_backend_id(&arrayBackend, outArray));
+    EXPECT_EQ(arrayBackend, activeBackend);
     // cleanup
     if(outArray != 0) ASSERT_EQ(AF_SUCCESS, af_release_array(outArray));
 }
@@ -37,10 +58,15 @@ void backendTest()
 {
     int backends = af::getAvailableBackends();
 
+    ASSERT_NE(backends, 0);
+
     bool cpu    = backends & AF_BACKEND_CPU;
     bool cuda   = backends & AF_BACKEND_CUDA;
     bool opencl = backends & AF_BACKEND_OPENCL;
 
+    printf("\nRunning Default Backend...\n");
+    testFunction<float>();
+
     if(cpu) {
         printf("\nRunning CPU Backend...\n");
         af::setBackend(AF_BACKEND_CPU);
diff --git a/test/basic_c.c b/test/basic_c.c
index f6c731092a..aac34e142d 100644
--- a/test/basic_c.c
+++ b/test/basic_c.c
@@ -9,9 +9,11 @@
 
 #include <arrayfire.h>
 
-int main() {
+int main()
+{
     af_array out = 0;
     dim_t s[] = {10, 10, 1, 1};
     af_err e = af_randu(&out, 4, s, f32);
+    if(out != 0) af_release_array(out); /* release the array if one was created */
     return (AF_SUCCESS != e);
 }
diff --git a/test/bilateral.cpp b/test/bilateral.cpp
index f0825e4893..cde330dca4 100644
--- a/test/bilateral.cpp
+++ b/test/bilateral.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 template<typename T, bool isColor>
diff --git a/test/binary.cpp b/test/binary.cpp
index 477748792f..91ebcbc8b2 100644
--- a/test/binary.cpp
+++ b/test/binary.cpp
@@ -14,6 +14,7 @@
 #include <testHelpers.hpp>
 
 using namespace std;
+using std::abs;
 using namespace af;
 
 const int num = 10000;
diff --git a/test/cholesky_dense.cpp b/test/cholesky_dense.cpp
index 70548d898c..7fd238d215 100644
--- a/test/cholesky_dense.cpp
+++ b/test/cholesky_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/convolve.cpp b/test/convolve.cpp
index f3ff9fd6ef..fff5ebffea 100644
--- a/test/convolve.cpp
+++ b/test/convolve.cpp
@@ -17,6 +17,7 @@
 
 using std::vector;
 using std::string;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/data b/test/data
index db4f6e8062..cec85080f1 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit db4f6e80629fb41580ab93208db6b8be958871df
+Subproject commit cec85080f12c25486d025d1fb1cf69e1beb03e58
diff --git a/test/diagonal.cpp b/test/diagonal.cpp
index c88f0fbeb1..c4becab2dc 100644
--- a/test/diagonal.cpp
+++ b/test/diagonal.cpp
@@ -14,6 +14,7 @@
 
 using namespace af;
 using std::vector;
+using std::abs;
 
 template<typename T>
 class Diagonal : public ::testing::Test
diff --git a/test/dot.cpp b/test/dot.cpp
index a25f59f27e..58cfbb2ed6 100644
--- a/test/dot.cpp
+++ b/test/dot.cpp
@@ -18,6 +18,7 @@
 
 using std::vector;
 using std::string;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/fast.cpp b/test/fast.cpp
index a114a8fdc6..8cb90574a6 100644
--- a/test/fast.cpp
+++ b/test/fast.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -27,7 +28,7 @@ typedef struct
     float f[5];
 } feat_t;
 
-bool feat_cmp(feat_t i, feat_t j)
+static bool feat_cmp(feat_t i, feat_t j)
 {
     for (int k = 0; k < 5; k++)
         if (i.f[k] != j.f[k])
@@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j)
     return false;
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
diff --git a/test/fft.cpp b/test/fft.cpp
index 84f0e2382e..48ff865d2a 100644
--- a/test/fft.cpp
+++ b/test/fft.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/fft_real.cpp b/test/fft_real.cpp
index c8d9a55ff0..8cd6612712 100644
--- a/test/fft_real.cpp
+++ b/test/fft_real.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/fftconvolve.cpp b/test/fftconvolve.cpp
index cd82ab20d9..ec6a3f3279 100644
--- a/test/fftconvolve.cpp
+++ b/test/fftconvolve.cpp
@@ -17,6 +17,7 @@
 
 using std::vector;
 using std::string;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/getting_started.cpp b/test/getting_started.cpp
index 12d0b6b1de..9d77af2b30 100644
--- a/test/getting_started.cpp
+++ b/test/getting_started.cpp
@@ -15,6 +15,7 @@
 
 using namespace af;
 using std::vector;
+using std::abs;
 
 TEST(GettingStarted, SNIPPET_getting_started_gen)
 {
diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp
index a65d52ad43..5794051152 100644
--- a/test/gloh_nonfree.cpp
+++ b/test/gloh_nonfree.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -38,7 +39,8 @@ typedef struct
     float d[272];
 } desc_t;
 
-bool feat_cmp(feat_desc_t i, feat_desc_t j)
+#ifdef  AF_BUILD_NONFREE_SIFT
+static bool feat_cmp(feat_desc_t i, feat_desc_t j)
 {
     for (int k = 0; k < 5; k++)
         if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f))
@@ -47,7 +49,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j)
     return true;
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -61,7 +63,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -75,7 +77,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
@@ -87,7 +89,7 @@ void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float
     }
 }
 
-void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
+static void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
 {
     f.resize(fd.size());
     d.resize(fd.size());
@@ -102,7 +104,7 @@ void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>&
     }
 }
 
-unsigned popcount(unsigned x)
+static unsigned popcount(unsigned x)
 {
     x = x - ((x >> 1) & 0x55555555);
     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
@@ -112,7 +114,7 @@ unsigned popcount(unsigned x)
     return x & 0x0000003F;
 }
 
-bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
+static bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
 {
     bool ret = true;
     float sum = 0.0f;
@@ -142,6 +144,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float
 
     return ret;
 }
+#endif
 
 template<typename T>
 class GLOH : public ::testing::Test
@@ -157,7 +160,7 @@ TYPED_TEST_CASE(GLOH, TestTypes);
 template<typename T>
 void glohTest(string pTestFile)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     if (noDoubleTests<T>()) return;
     if (noImageIOTests()) return;
 
@@ -269,7 +272,7 @@ void glohTest(string pTestFile)
 //
 TEST(GLOH, CPP)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     if (noDoubleTests<float>()) return;
     if (noImageIOTests()) return;
 
diff --git a/test/harris.cpp b/test/harris.cpp
index 276a3e357f..0adde6f95d 100644
--- a/test/harris.cpp
+++ b/test/harris.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -27,7 +28,7 @@ typedef struct
     float f[5];
 } feat_t;
 
-bool feat_cmp(feat_t i, feat_t j)
+static bool feat_cmp(feat_t i, feat_t j)
 {
     for (int k = 0; k < 5; k++)
         if (i.f[k] != j.f[k])
@@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j)
     return false;
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
diff --git a/test/histogram.cpp b/test/histogram.cpp
index f1d7af51b9..c83ba0464f 100644
--- a/test/histogram.cpp
+++ b/test/histogram.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 
 template<typename T>
 class Histogram : public ::testing::Test
diff --git a/test/homography.cpp b/test/homography.cpp
index 662b7a2a56..1bd24425be 100644
--- a/test/homography.cpp
+++ b/test/homography.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 template<typename T>
diff --git a/test/imageio.cpp b/test/imageio.cpp
index d19aac346c..4029de5a1b 100644
--- a/test/imageio.cpp
+++ b/test/imageio.cpp
@@ -36,8 +36,6 @@ typedef ::testing::Types<float> TestTypes;
 // register the type list
 TYPED_TEST_CASE(ImageIO, TestTypes);
 
-// Disable tests if FreeImage is not found
-#if defined(WITH_FREEIMAGE)
 void loadImageTest(string pTestFile, string pImageFile, const bool isColor)
 {
     if (noDoubleTests<float>()) return;
@@ -251,4 +249,140 @@ TEST(ImageMem, SaveMemBMP)
     af::deleteImageMem(savedMem);
 }
 
-#endif // WITH_FREEIMAGE
+TEST(ImageIO, LoadImage16CPP)
+{
+    // Load a 16-bit color PNG as float and compare it with the stored reference.
+    if (noImageIOTests()) return;
+
+    vector<af::dim4> numDims;
+
+    vector<vector<float> >   in;
+    vector<vector<float> >   tests;
+    readTests<float, float, float>(string(TEST_DIR"/imageio/color_seq_16.test"),numDims,in,tests);
+
+    af::array img = af::loadImage(string(TEST_DIR"/imageio/color_seq_16.png").c_str(), true);
+    ASSERT_EQ(img.type(), f32); // loadImage should always return float
+
+    // Get result. The buffer is sized from the array that is copied into it and
+    // is owned by a vector, so a failing assertion below cannot leak it.
+    vector<float> imgData(img.elements());
+    img.host((void*)&imgData[0]);
+
+    // Compare result
+    ASSERT_FALSE(in.empty());
+    size_t nElems = in[0].size();
+    ASSERT_LE(nElems, imgData.size()); // never read past the host copy
+
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl;
+    }
+}
+
+TEST(ImageIO, SaveImage16CPP)
+{
+    // Round trip: a saved u16 image must load back equal to the input divided by 257.
+    if (noImageIOTests()) return;
+
+    const af::dim4 imageDims(16, 24, 3);
+    af::array original = af::randu(imageDims, u16);
+    af::array expected = (original / 257).as(u16);
+
+    af::saveImage("saveImage16CPP.png", original);
+    af::array reloaded = af::loadImage("saveImage16CPP.png", true);
+    ASSERT_EQ(reloaded.type(), f32); // loadImage should always return float
+
+    af::array difference = abs(reloaded - expected);
+    ASSERT_FALSE(af::anyTrue<bool>(difference));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Image IO Native Tests
+////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+void loadImageNativeCPPTest(string pTestFile, string pImageFile)
+{
+    // Loads pImageFile in its native type T and compares it with the data in pTestFile.
+    if (noImageIOTests()) return;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> >   in;
+    vector<vector<float> >   tests;
+    readTests<float, float, float>(pTestFile,numDims,in,tests);
+
+    af::array img = af::loadImageNative(pImageFile.c_str());
+    ASSERT_EQ(img.type(), (af_dtype)af::dtype_traits<T>::af_type);
+
+    // Get result. The buffer is sized from the array that is copied into it and
+    // is owned by a vector, so a failing assertion below cannot leak it.
+    vector<T> imgData(img.elements());
+    img.host((void*)&imgData[0]);
+
+    // Compare result
+    ASSERT_FALSE(in.empty());
+    size_t nElems = in[0].size();
+    ASSERT_LE(nElems, imgData.size()); // never read past the host copy
+
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl;
+    }
+}
+
+TEST(ImageIONative, LoadImageNative8CPP)
+{
+    const string base(TEST_DIR"/imageio/color_small"); // stem shared by the .test/.png pair
+    loadImageNativeCPPTest<uchar>(base + ".test", base + ".png");
+}
+
+TEST(ImageIONative, LoadImageNative16SmallCPP)
+{
+    const string base(TEST_DIR"/imageio/color_small_16"); // stem shared by the .test/.png pair
+    loadImageNativeCPPTest<ushort>(base + ".test", base + ".png");
+}
+
+TEST(ImageIONative, LoadImageNative16ColorCPP)
+{
+    const string base(TEST_DIR"/imageio/color_seq_16"); // stem shared by the .test/.png pair
+    loadImageNativeCPPTest<ushort>(base + ".test", base + ".png");
+}
+
+TEST(ImageIONative, LoadImageNative16GrayCPP)
+{
+    const string base(TEST_DIR"/imageio/gray_seq_16"); // stem shared by the .test/.png pair
+    loadImageNativeCPPTest<ushort>(base + ".test", base + ".png");
+}
+
+template<typename T>
+void saveLoadImageNativeCPPTest(af::dim4 dims)
+{
+    // Round trip: random data of type T saved natively must load back with equal type and values.
+    if (noImageIOTests()) return;
+
+    const af_dtype nativeType = (af_dtype)af::dtype_traits<T>::af_type;
+    af::array written = af::randu(dims, nativeType);
+    af::saveImageNative("saveImageNative.png", written);
+
+    af::array readBack = af::loadImageNative("saveImageNative.png");
+    ASSERT_EQ(readBack.type(), written.type());
+    ASSERT_FALSE(af::anyTrue<bool>(written - readBack));
+}
+
+TEST(ImageIONative, SaveLoadImageNative8CPP)
+{
+    saveLoadImageNativeCPPTest<uchar>(af::dim4(480, 720, 3)); // uchar, three channels
+}
+
+TEST(ImageIONative, SaveLoadImageNative16SmallCPP)
+{
+    saveLoadImageNativeCPPTest<ushort>(af::dim4(8, 12, 3)); // ushort, three channels, small
+}
+
+TEST(ImageIONative, SaveLoadImageNative16ColorCPP)
+{
+    saveLoadImageNativeCPPTest<ushort>(af::dim4(480, 720, 3)); // ushort, three channels
+}
+
+TEST(ImageIONative, SaveLoadImageNative16GrayCPP)
+{
+    saveLoadImageNativeCPPTest<ushort>(af::dim4(24, 32)); // ushort, single channel
+}
diff --git a/test/internal.cpp b/test/internal.cpp
new file mode 100644
index 0000000000..75fa54fdb9
--- /dev/null
+++ b/test/internal.cpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <af/internal.h>
+#include <string>
+#include <vector>
+#include <testHelpers.hpp>
+
+TEST(Internal, CreateStrided)
+{
+    // Host data: one leading element, then two groups of four 5-value rows.
+    // The strided view below exposes a 3x3 window of each group.
+    float ha[] = {1,
+                  101, 102, 103, 104, 105,
+                  201, 202, 203, 204, 205,
+                  301, 302, 303, 304, 305,
+                  401, 402, 403, 404, 405,
+
+                  1010, 1020, 1030, 1040, 1050,
+                  2010, 2020, 2030, 2040, 2050,
+                  3010, 3020, 3030, 3040, 3050,
+                  4010, 4020, 4030, 4040, 4050};
+
+    dim_t offset = 1;
+    unsigned ndims = 3;
+    dim_t dims[] = {3, 3, 2};
+    dim_t strides[] = {1, 5, 20};
+
+    af::dim4 viewDims(ndims, dims);
+    af::dim4 viewStrides(ndims, strides);
+    af::array a = createStridedArray((void *)ha, offset, viewDims, viewStrides,
+                                     f32, afHost);
+
+    // The array must report back the layout it was created with.
+    af::dim4 astrides = getStrides(a);
+    af::dim4 adims = a.dims();
+
+    ASSERT_EQ(offset, getOffset(a));
+    for (unsigned d = 0; d < ndims; d++) {
+        ASSERT_EQ(strides[d], astrides[d]);
+        ASSERT_EQ(dims[d], adims[d]);
+    }
+
+    std::vector<float> va(a.elements());
+    a.host(&va[0]);
+
+    // Each element of the packed host copy must equal its strided source element.
+    for (int k = 0; k < dims[2]; k++) {
+        for (int j = 0; j < dims[1]; j++) {
+            for (int i = 0; i < dims[0]; i++) {
+                dim_t packed  = i + dims[0] * (j + dims[1] * k);
+                dim_t strided = offset + i * strides[0] + j * strides[1] + k * strides[2];
+                ASSERT_EQ(va[packed], ha[strided])
+                    << "at (" << i << "," << j << "," << k << ")";
+            }
+        }
+    }
+}
+
+TEST(Internal, CheckInfo)
+{
+    // Extents of the parent array, then offset and extent of the sub-array taken from it.
+    int xdim = 10;
+    int ydim = 8;
+    int xoff = 1;
+    int yoff = 2;
+    int xnum = 5;
+    int ynum = 3;
+
+    // Built from xdim/ydim so the stride checks below cannot drift from the data.
+    af::array a = af::randu(xdim, ydim);
+    af::array b = a(af::seq(xoff, xoff + xnum - 1),
+                    af::seq(yoff, yoff + ynum - 1));
+
+    af::dim4 strides = getStrides(b);
+    af::dim4 dims = b.dims();
+
+    // Expected position of b's first element, with the first dimension varying fastest.
+    dim_t offset = xoff + yoff * xdim;
+
+    ASSERT_EQ(dims[0], xnum);
+    ASSERT_EQ(dims[1], ynum);
+    ASSERT_EQ(isOwner(a), true);
+    ASSERT_EQ(isOwner(b), false);
+
+    ASSERT_EQ(getOffset(b), offset);
+    ASSERT_EQ(strides[0], 1);
+    ASSERT_EQ(strides[1], xdim);
+    ASSERT_EQ(strides[2], xdim * ydim);
+    ASSERT_EQ(getRawPtr(a), getRawPtr(b));
+}
+
+TEST(Internal, Linear)
+{
+    af::array c;
+    {
+        af::array a = af::randu(10, 8);
+
+        // A plain copy refers to the same underlying data as a
+        // and is still reported as an owner.
+        af::array b = a;
+        ASSERT_TRUE(isOwner(b));
+
+        // Indexing with span is treated as a sub array,
+        // which is not reported as an owner.
+        c = a(af::span);
+        ASSERT_FALSE(isOwner(c));
+    }
+
+    // a and b are out of scope now; c must still not be reported as an owner.
+    {
+        ASSERT_FALSE(isOwner(c));
+    }
+}
diff --git a/test/inverse_dense.cpp b/test/inverse_dense.cpp
index b0568ebbdb..1b990b6900 100644
--- a/test/inverse_dense.cpp
+++ b/test/inverse_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/jit.cpp b/test/jit.cpp
index 3c2308d5eb..a20b0f4b19 100644
--- a/test/jit.cpp
+++ b/test/jit.cpp
@@ -65,3 +65,53 @@ TEST(JIT, CPP_JIT_HASH)
         delete[] hF2;
     }
 }
+
+TEST(JIT, CPP_JIT_Reset_Binary)
+{
+    using af::array;
+
+    // e is evaluated first; f and g reuse c and d afterwards and must be exact negations.
+    array a = af::constant(2, 5,5);
+    array b = af::constant(1, 5,5);
+    array c = a + b;
+    array d = a - b;
+    array e = c * d;
+    e.eval();
+    array f = c - d;
+    f.eval();
+    array g = d - c;
+    g.eval();
+
+    std::vector<float> hf(f.elements()), hg(g.elements());
+    f.host(&hf[0]);
+    g.host(&hg[0]);
+
+    for (size_t idx = 0; idx < hf.size(); idx++) {
+        ASSERT_EQ(hf[idx], -hg[idx]);
+    }
+}
+
+TEST(JIT, CPP_JIT_Reset_Unary)
+{
+    using af::array;
+
+    // e is evaluated first; f and g reuse c and d afterwards and must be exact negations.
+    array a = af::constant(2, 5,5);
+    array b = af::constant(1, 5,5);
+    array c = af::sin(a);
+    array d = af::cos(b);
+    array e = c * d;
+    e.eval();
+    array f = c - d;
+    f.eval();
+    array g = d - c;
+    g.eval();
+
+    std::vector<float> hf(f.elements()), hg(g.elements());
+    f.host(&hf[0]);
+    g.host(&hg[0]);
+
+    for (size_t idx = 0; idx < hf.size(); idx++) {
+        ASSERT_EQ(hf[idx], -hg[idx]);
+    }
+}
diff --git a/test/lu_dense.cpp b/test/lu_dense.cpp
index cdb23ef962..0783fb3425 100644
--- a/test/lu_dense.cpp
+++ b/test/lu_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/main.cpp b/test/main.cpp
new file mode 100644
index 0000000000..76f841f1b1
--- /dev/null
+++ b/test/main.cpp
@@ -0,0 +1,6 @@
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv) {
+    ::testing::InitGoogleTest(&argc, argv); // let gtest parse its command-line flags
+    return RUN_ALL_TESTS();                 // the gtest result becomes the exit status
+}
diff --git a/test/math.cpp b/test/math.cpp
index 035ca257d2..e286e2a202 100644
--- a/test/math.cpp
+++ b/test/math.cpp
@@ -14,6 +14,7 @@
 
 using namespace std;
 using namespace af;
+using std::abs;
 
 const int num = 10000;
 const float flt_err = 1e-3;
diff --git a/test/meanshift.cpp b/test/meanshift.cpp
index 0116a5e3da..a35ca288d9 100644
--- a/test/meanshift.cpp
+++ b/test/meanshift.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 template<typename T>
@@ -64,11 +65,12 @@ void meanshiftTest(string pTestFile)
 
     for (size_t testId=0; testId<testCount; ++testId) {
 
-        af_array inArray     = 0;
-        af_array inArray_f32 = 0;
-        af_array outArray    = 0;
-        af_array goldArray   = 0;
-        dim_t nElems      = 0;
+        af_array inArray        = 0;
+        af_array inArray_f32    = 0;
+        af_array outArray       = 0;
+        af_array goldArray      = 0;
+        af_array goldArray_f32  = 0;
+        dim_t nElems            = 0;
 
         inFiles[testId].insert(0,string(TEST_DIR"/meanshift/"));
         outFiles[testId].insert(0,string(TEST_DIR"/meanshift/"));
@@ -76,7 +78,8 @@ void meanshiftTest(string pTestFile)
         ASSERT_EQ(AF_SUCCESS, af_load_image(&inArray_f32, inFiles[testId].c_str(), isColor));
         ASSERT_EQ(AF_SUCCESS, conv_image<T>(&inArray, inArray_f32));
 
-        ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray, outFiles[testId].c_str(), isColor));
+        ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, outFiles[testId].c_str(), isColor));
+        ASSERT_EQ(AF_SUCCESS, conv_image<T>(&goldArray, goldArray_f32)); // af_load_image always returns float array
         ASSERT_EQ(AF_SUCCESS, af_get_elements(&nElems, goldArray));
 
         ASSERT_EQ(AF_SUCCESS, af_mean_shift(&outArray, inArray, 2.25f, 25.56f, 5, isColor));
@@ -93,6 +96,7 @@ void meanshiftTest(string pTestFile)
         ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32));
         ASSERT_EQ(AF_SUCCESS, af_release_array(outArray));
         ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray_f32));
     }
 }
 
diff --git a/test/medfilt.cpp b/test/medfilt.cpp
index 9b4590885b..2e3a1fcb6b 100644
--- a/test/medfilt.cpp
+++ b/test/medfilt.cpp
@@ -17,6 +17,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 
 template<typename T>
 class MedianFilter : public ::testing::Test
diff --git a/test/morph.cpp b/test/morph.cpp
index d9c5282146..c42ddf0cba 100644
--- a/test/morph.cpp
+++ b/test/morph.cpp
@@ -18,6 +18,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 
 template<typename T>
 class Morph : public ::testing::Test
diff --git a/test/ocl_ext_context.cpp b/test/ocl_ext_context.cpp
new file mode 100644
index 0000000000..e711c631e4
--- /dev/null
+++ b/test/ocl_ext_context.cpp
@@ -0,0 +1,131 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#if defined(AF_OPENCL)
+#include <af/opencl.h>
+#include <iostream>
+
+using namespace std;
+
+inline void checkErr(cl_int err, const char * name) {
+    // On any OpenCL error: report name and code, then terminate the process.
+    if (err == CL_SUCCESS) return;
+    std::cerr << "ERROR: " << name  << " (" << err << ")" << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+void getExternals(cl_device_id &deviceId, cl_context &context, cl_command_queue &queue)
+{
+    // Created on the first call and handed out unchanged on every later call.
+    static cl_device_id dId = NULL;
+    static cl_context cId = NULL;
+    static cl_command_queue qId = NULL;
+    static bool call_once = true;
+
+    if (call_once) {
+        cl_int errorCode = 0;
+        cl_platform_id platformId = NULL;
+        cl_uint numPlatforms;
+        cl_uint numDevices;
+        errorCode = clGetPlatformIDs(1, &platformId, &numPlatforms);
+        checkErr(errorCode, "Get Platforms failed");
+
+        errorCode = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &dId, &numDevices);
+        checkErr(errorCode, "Get cl_device_id failed");
+
+        cId = clCreateContext(NULL, 1, &dId, NULL, NULL, &errorCode);
+        checkErr(errorCode, "Context creation failed");
+
+        qId = clCreateCommandQueue(cId, dId, 0, &errorCode);
+        checkErr(errorCode, "Command queue creation failed");
+        call_once = false;
+    }
+    deviceId = dId;
+    context  = cId;
+    queue    = qId;
+}
+
+TEST(OCLExtContext, push)
+{
+    // Registers the externally created device/context/queue with ArrayFire.
+    cl_device_id deviceId = NULL;
+    cl_context context = NULL;
+    cl_command_queue queue = NULL;
+    getExternals(deviceId, context, queue);
+    int dCount = af::getDeviceCount();
+    printf("%d devices before afcl::addDevice\n", dCount);
+    af::info();
+    afcl::addDevice(deviceId, context, queue);
+    ASSERT_EQ(dCount + 1, af::getDeviceCount()); // prints both counts on failure
+    printf("%d devices after afcl::addDevice\n", af::getDeviceCount());
+    af::info();
+}
+
+TEST(OCLExtContext, set)
+{
+    // Select the externally created device and run a small computation on it.
+    cl_device_id deviceId = NULL;
+    cl_context context = NULL;
+    cl_command_queue queue = NULL;
+    getExternals(deviceId, context, queue);
+
+    afcl::setDevice(deviceId, context);
+    af::info();
+    const int x = 5;
+    const int y = 5;
+    af::array ones = af::constant(1, x, y);
+    vector<float> host(x * y);
+    ones.host((void*)&host[0]);
+    for (size_t idx = 0; idx < host.size(); ++idx) {
+        ASSERT_EQ(host[idx], 1.0f);
+    }
+}
+
+TEST(OCLExtContext, pop)
+{
+    // Removes the externally created device/context from ArrayFire again.
+    cl_device_id deviceId = NULL;
+    cl_context context = NULL;
+    cl_command_queue queue = NULL;
+    getExternals(deviceId, context, queue);
+    int dCount = af::getDeviceCount();
+    printf("%d devices before afcl::deleteDevice\n", dCount);
+    af::setDevice(0);
+    af::info();
+    afcl::deleteDevice(deviceId, context);
+    ASSERT_EQ(dCount - 1, af::getDeviceCount()); // prints both counts on failure
+    printf("%d devices after afcl::deleteDevice\n", af::getDeviceCount());
+    af::info();
+}
+
+TEST(OCLCheck, DeviceType)
+{
+    // The device type reported by ArrayFire must match what OpenCL reports.
+    afcl::deviceType devType = afcl::getDeviceType();
+    cl_device_type type = 0;
+    cl_int err = clGetDeviceInfo(afcl::getDeviceId(), CL_DEVICE_TYPE,
+                                 sizeof(cl_device_type), &type, NULL);
+    // Without this check a failed query would compare a value OpenCL never set.
+    ASSERT_EQ(CL_SUCCESS, err);
+    ASSERT_EQ(type, (cl_device_type)devType);
+}
+
+TEST(OCLCheck, DevicePlatform)
+{
+    // The platform reported by afcl::getPlatform() must not be the unknown value.
+    ASSERT_NE(afcl::getPlatform(), AFCL_PLATFORM_UNKNOWN);
+}
+
+#else
+TEST(OCLExtContext, NoopCPU)
+{ // Intentionally empty: the only test in this file when AF_OPENCL is not defined.
+}
+#endif
diff --git a/test/orb.cpp b/test/orb.cpp
index 5259366901..1266f20eb6 100644
--- a/test/orb.cpp
+++ b/test/orb.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -38,7 +39,7 @@ typedef struct
     unsigned d[8];
 } desc_t;
 
-bool feat_cmp(feat_desc_t i, feat_desc_t j)
+static bool feat_cmp(feat_desc_t i, feat_desc_t j)
 {
     for (int k = 0; k < 5; k++)
         if (i.f[k] != j.f[k])
@@ -47,7 +48,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j)
     return true;
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -61,7 +62,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<unsigned> >& desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<unsigned> >& desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -75,7 +76,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
@@ -87,7 +88,7 @@ void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float
     }
 }
 
-void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
+static void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
 {
     f.resize(fd.size());
     d.resize(fd.size());
@@ -102,7 +103,7 @@ void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>&
     }
 }
 
-unsigned popcount(unsigned x)
+static unsigned popcount(unsigned x)
 {
     x = x - ((x >> 1) & 0x55555555);
     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
diff --git a/test/qr_dense.cpp b/test/qr_dense.cpp
index 708eb5d0cd..e3809546b1 100644
--- a/test/qr_dense.cpp
+++ b/test/qr_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/random.cpp b/test/random.cpp
index 29f157a776..74f7e6541b 100644
--- a/test/random.cpp
+++ b/test/random.cpp
@@ -59,6 +59,7 @@ void randuTest(af::dim4 & dims)
 
     af_array outArray = 0;
     ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits<T>::af_type));
+    ASSERT_EQ(af_sync(-1), AF_SUCCESS);
     if(outArray != 0) af_release_array(outArray);
 }
 
@@ -69,6 +70,7 @@ void randnTest(af::dim4 &dims)
 
     af_array outArray = 0;
     ASSERT_EQ(AF_SUCCESS, af_randn(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits<T>::af_type));
+    ASSERT_EQ(af_sync(-1), AF_SUCCESS);
     if(outArray != 0) af_release_array(outArray);
 }
 
@@ -124,6 +126,7 @@ void randuArgsTest()
     dim_t dims[] = {1, 2, 3, 0};
     af_array outArray = 0;
     ASSERT_EQ(AF_ERR_SIZE, af_randu(&outArray, ndims, dims, (af_dtype) af::dtype_traits<char>::af_type));
+    ASSERT_EQ(af_sync(-1), AF_SUCCESS);
     if(outArray != 0) af_release_array(outArray);
 }
 
@@ -143,6 +146,7 @@ TEST(Random, CPP)
     af::dim4 dims(1, 2, 3, 1);
     af::array out1 = af::randu(dims);
     af::array out2 = af::randn(dims);
+    af::sync();
 }
 
 template<typename T>
diff --git a/test/rank_dense.cpp b/test/rank_dense.cpp
index d0a19af3b9..7f2e76db0d 100644
--- a/test/rank_dense.cpp
+++ b/test/rank_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/reduce.cpp b/test/reduce.cpp
index f71dc76b80..675ed8fc4a 100644
--- a/test/reduce.cpp
+++ b/test/reduce.cpp
@@ -109,16 +109,6 @@ void reduceTest(string pTestFile, int off = 0, bool isSubRef=false, const vector
     ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
 }
 
-vector<af_seq> init_subs()
-{
-    vector<af_seq> subs;
-    subs.push_back(af_make_seq(2, 6, 1));
-    subs.push_back(af_make_seq(1, 5, 1));
-    subs.push_back(af_make_seq(1, 3, 1));
-    subs.push_back(af_make_seq(1, 2, 1));
-    return subs;
-}
-
 template<typename T,reduceFunc OP>
 struct promote_type {
     typedef T type;
diff --git a/test/replace.cpp b/test/replace.cpp
index 9e99eaee8f..faa5636eb8 100644
--- a/test/replace.cpp
+++ b/test/replace.cpp
@@ -130,3 +130,46 @@ TEST(Replace, NaN)
         ASSERT_EQ(hc[i], std::isnan(ha[i]) ? b : ha[i]);
     }
 }
+
+// Test named for issue 1249: replace() output is compared against plain arithmetic.
+TEST(Replace, ISSUE_1249)
+{
+    dim4 dims(2, 3, 4);
+    array cond = af::randu(dims) > 0.5;
+    array a = af::randu(dims);
+
+    array b = a.copy();
+    replace(b, !cond, a - a * 0.9);
+    // Reference value computed without replace().
+    array c = a - a * cond * 0.9;
+
+    const int count = (int)dims.elements();
+    std::vector<float> replaced(count), expected(count);
+    b.host(&replaced[0]);
+    c.host(&expected[0]);
+
+    for (int idx = 0; idx != count; ++idx)
+        ASSERT_EQ(expected[idx], replaced[idx]) << "at " << idx;
+}
+
+
+// Same comparison as the 3D replace() test, on a four-dimensional array.
+TEST(Replace, 4D)
+{
+    dim4 dims(2, 3, 4, 2);
+    array cond = af::randu(dims) > 0.5;
+    array a = af::randu(dims);
+
+    array b = a.copy();
+    replace(b, !cond, a - a * 0.9);
+    // Reference value computed without replace().
+    array c = a - a * cond * 0.9;
+
+    const int count = (int)dims.elements();
+    std::vector<float> replaced(count), expected(count);
+    b.host(&replaced[0]);
+    c.host(&expected[0]);
+
+    for (int idx = 0; idx != count; ++idx)
+        ASSERT_EQ(expected[idx], replaced[idx]) << "at " << idx;
+}
diff --git a/test/resize.cpp b/test/resize.cpp
index 6ec4e553c6..6c29e61cc6 100644
--- a/test/resize.cpp
+++ b/test/resize.cpp
@@ -20,6 +20,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
@@ -64,7 +65,7 @@ TYPED_TEST(Resize, InvalidDims)
 {
     if (noDoubleTests<TypeParam>()) return;
 
-    vector<TypeParam> in(8,8);
+    vector<TypeParam> in(8*8);
 
     af_array inArray  = 0;
     af_array outArray = 0;
diff --git a/test/rotate.cpp b/test/rotate.cpp
index f97cd3ab96..0d4b460033 100644
--- a/test/rotate.cpp
+++ b/test/rotate.cpp
@@ -20,6 +20,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/rotate_linear.cpp b/test/rotate_linear.cpp
index 29a9107e4c..15734a3cc2 100644
--- a/test/rotate_linear.cpp
+++ b/test/rotate_linear.cpp
@@ -20,11 +20,12 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
 template<typename T>
-class Rotate : public ::testing::Test
+class RotateLinear : public ::testing::Test
 {
     public:
         virtual void SetUp() {
@@ -39,7 +40,7 @@ class Rotate : public ::testing::Test
 typedef ::testing::Types<float, double, cfloat, cdouble, int, intl, char, short> TestTypes;
 
 // register the type list
-TYPED_TEST_CASE(Rotate, TestTypes);
+TYPED_TEST_CASE(RotateLinear, TestTypes);
 
 #define PI 3.1415926535897931f
 
@@ -107,10 +108,10 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c
     if(tempArray != 0) af_release_array(tempArray);
 }
 
-#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter)                               \
-    TYPED_TEST(Rotate, desc)                                                                    \
-    {                                                                                           \
-        rotateTest<TypeParam>(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter);\
+#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter)       \
+    TYPED_TEST(RotateLinear, desc)                                      \
+    { /* reads TEST_DIR/rotate/<file>.test; the remaining macro arguments are forwarded to rotateTest as-is */ \
+        rotateTest<TypeParam>(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter); \
     }
 
     ROTATE_INIT(Square180NoCropRecenter     , rotatelinear1,  0, 180, false, true);
@@ -165,7 +166,7 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c
 
 ////////////////////////////////// CPP //////////////////////////////////////
 
-TEST(Rotate, CPP)
+TEST(RotateLinear, CPP)
 {
     if (noDoubleTests<float>()) return;
 
diff --git a/test/scan.cpp b/test/scan.cpp
index 386568d402..34a077f122 100644
--- a/test/scan.cpp
+++ b/test/scan.cpp
@@ -82,16 +82,6 @@ void scanTest(string pTestFile, int off = 0, bool isSubRef=false, const vector<a
     ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
 }
 
-vector<af_seq> init_subs()
-{
-    vector<af_seq> subs;
-    subs.push_back(af_make_seq(2, 6, 1));
-    subs.push_back(af_make_seq(1, 5, 1));
-    subs.push_back(af_make_seq(1, 3, 1));
-    subs.push_back(af_make_seq(1, 2, 1));
-    return subs;
-}
-
 #define SCAN_TESTS(FN, TAG, Ti, To)             \
     TEST(Scan,Test_##FN##_##TAG)                \
     {                                           \
diff --git a/test/select.cpp b/test/select.cpp
index 1c39282b15..6e772ac7c4 100644
--- a/test/select.cpp
+++ b/test/select.cpp
@@ -136,3 +136,43 @@ TEST(Select, NaN)
         ASSERT_EQ(hc[i], std::isnan(ha[i]) ? b : ha[i]);
     }
 }
+
+// Test named for issue 1249: select() output is compared against plain arithmetic.
+TEST(Select, ISSUE_1249)
+{
+    dim4 dims(2, 3, 4);
+    array cond = af::randu(dims) > 0.5;
+    array a = af::randu(dims);
+
+    array b = select(cond, a - a * 0.9, a);
+    // Reference value computed without select().
+    array c = a - a * cond * 0.9;
+
+    const int count = (int)dims.elements();
+    std::vector<float> selected(count), expected(count);
+    b.host(&selected[0]);
+    c.host(&expected[0]);
+
+    for (int idx = 0; idx != count; ++idx)
+        ASSERT_EQ(expected[idx], selected[idx]) << "at " << idx;
+}
+
+// Same comparison as the 3D select() test, on a four-dimensional array.
+TEST(Select, 4D)
+{
+    dim4 dims(2, 3, 4, 2);
+    array cond = af::randu(dims) > 0.5;
+    array a = af::randu(dims);
+
+    array b = select(cond, a - a * 0.9, a);
+    // Reference value computed without select().
+    array c = a - a * cond * 0.9;
+
+    const int count = (int)dims.elements();
+    std::vector<float> selected(count), expected(count);
+    b.host(&selected[0]);
+    c.host(&expected[0]);
+
+    for (int idx = 0; idx != count; ++idx)
+        ASSERT_EQ(expected[idx], selected[idx]) << "at " << idx;
+}
diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp
index cf1683f775..6776c18a86 100644
--- a/test/sift_nonfree.cpp
+++ b/test/sift_nonfree.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -37,8 +38,8 @@ typedef struct
 {
     float d[128];
 } desc_t;
-
-bool feat_cmp(feat_desc_t i, feat_desc_t j)
+#ifdef AF_BUILD_NONFREE_SIFT
+static bool feat_cmp(feat_desc_t i, feat_desc_t j)
 {
     for (int k = 0; k < 5; k++)
         if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f))
@@ -47,7 +48,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j)
     return true;
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -61,7 +62,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
+static void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (size_t i = 0; i < feat.size(); i++) {
@@ -75,7 +76,7 @@ void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* sc
     }
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
@@ -87,7 +88,7 @@ void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float
     }
 }
 
-void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
+static void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
 {
     f.resize(fd.size());
     d.resize(fd.size());
@@ -102,7 +103,7 @@ void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>&
     }
 }
 
-unsigned popcount(unsigned x)
+static unsigned popcount(unsigned x)
 {
     x = x - ((x >> 1) & 0x55555555);
     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
@@ -112,7 +113,7 @@ unsigned popcount(unsigned x)
     return x & 0x0000003F;
 }
 
-bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
+static bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
 {
     bool ret = true;
     float sum = 0.0f;
@@ -142,6 +143,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float
 
     return ret;
 }
+#endif
 
 template<typename T>
 class SIFT : public ::testing::Test
@@ -157,7 +159,7 @@ TYPED_TEST_CASE(SIFT, TestTypes);
 template<typename T>
 void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeThr, float initSigma, bool doubleInput)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     if (noDoubleTests<T>()) return;
     if (noImageIOTests()) return;
 
@@ -275,7 +277,7 @@ void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeT
 //
 TEST(SIFT, CPP)
 {
-#ifdef AF_BUILD_SIFT
+#ifdef AF_BUILD_NONFREE_SIFT
     if (noDoubleTests<float>()) return;
     if (noImageIOTests()) return;
 
diff --git a/test/solve_dense.cpp b/test/solve_dense.cpp
index bbb67409dc..183afdbcc8 100644
--- a/test/solve_dense.cpp
+++ b/test/solve_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
@@ -185,15 +186,12 @@ SOLVE_TESTS(cdouble, 1E-5)
 #define SOLVE_TESTS(T, eps)                     \
     TEST(SOLVE, T##RectOver)                    \
     {                                           \
-        solveTester<T>(800, 600, 50, eps);      \
+        solveTester<T>(800, 600, 64, eps);      \
     }
 
 SOLVE_TESTS(float, 0.01)
 SOLVE_TESTS(double, 1E-5)
-// Fails on Windows on some devices
-#if !(defined(OS_WIN) && defined(AF_OPENCL))
 SOLVE_TESTS(cfloat, 0.01)
 SOLVE_TESTS(cdouble, 1E-5)
-#endif
 
 #undef SOLVE_TESTS
diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp
index 3d82b9fd90..ed827c9da5 100644
--- a/test/sort_by_key.cpp
+++ b/test/sort_by_key.cpp
@@ -26,7 +26,7 @@ using af::cfloat;
 using af::cdouble;
 
 template<typename T>
-class Sort : public ::testing::Test
+class SortByKey : public ::testing::Test
 {
     public:
         virtual void SetUp() {
@@ -41,7 +41,7 @@ class Sort : public ::testing::Test
 typedef ::testing::Types<float, double, uint, int, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
-TYPED_TEST_CASE(Sort, TestTypes);
+TYPED_TEST_CASE(SortByKey, TestTypes);
 
 template<typename T>
 void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector<af_seq> * seqv = NULL)
@@ -104,10 +104,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     if(tempArray != 0) af_release_array(tempArray);
 }
 
-#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1)                                       \
-    TYPED_TEST(Sort, desc)                                                                       \
-    {                                                                                            \
-        sortTest<TypeParam>(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1);  \
+#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1)              \
+    TYPED_TEST(SortByKey, desc)                                         \
+    { /* reads TEST_DIR/sort/<file>.test; dir and both result indices are forwarded to sortTest as-is */ \
+        sortTest<TypeParam>(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \
     }
 
     SORT_INIT(Sort0True,      sort_by_key_tiny,  true,  0, 1);
@@ -116,9 +116,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(Sort10x10False, sort_by_key_2D,    false, 2, 3);
     SORT_INIT(Sort1000True,   sort_by_key_1000,  true,  0, 1);
     SORT_INIT(SortMedTrue,    sort_by_key_med,   true,  0, 1);
-    // FIXME: below two tests are disabled temporarily until issue#995 is fixed
-    //SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
-    //SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
+    SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
+    SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
     // Takes too much time in current implementation. Enable when everything is parallel
     //SORT_INIT(SortLargeTrue,  sort_by_key_large, true,  0, 1);
     //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3);
@@ -169,4 +168,3 @@ TEST(SortByKey, CPP)
     delete[] keyData;
     delete[] valData;
 }
-
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index 0711e8b494..6aa240d5a5 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -26,7 +26,7 @@ using af::cfloat;
 using af::cdouble;
 
 template<typename T>
-class Sort : public ::testing::Test
+class SortIndex : public ::testing::Test
 {
     public:
         virtual void SetUp() {
@@ -41,7 +41,7 @@ class Sort : public ::testing::Test
 typedef ::testing::Types<float, double, uint, int, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
-TYPED_TEST_CASE(Sort, TestTypes);
+TYPED_TEST_CASE(SortIndex, TestTypes);
 
 template<typename T>
 void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector<af_seq> * seqv = NULL)
@@ -102,10 +102,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     if(tempArray != 0) af_release_array(tempArray);
 }
 
-#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1)                                       \
-    TYPED_TEST(Sort, desc)                                                                       \
-    {                                                                                            \
-        sortTest<TypeParam>(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1);  \
+#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1)              \
+    TYPED_TEST(SortIndex, desc)                                         \
+    { /* reads TEST_DIR/sort/<file>.test; dir and both result indices are forwarded to sortTest as-is */ \
+        sortTest<TypeParam>(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \
     }
 
     SORT_INIT(Sort0True,  sort, true, 0, 1);
@@ -117,9 +117,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(Sort10x10False, sort_10x10, false, 2, 3);
     SORT_INIT(Sort1000True,   sort_1000,  true,  0, 1);
     SORT_INIT(SortMedTrue,    sort_med1,  true,  0, 1);
-    // FIXME: below two tests are disabled temporarily until issue#995 is fixed
-    //SORT_INIT(Sort1000False,  sort_1000,  false, 2, 3);
-    //SORT_INIT(SortMedFalse,   sort_med1,  false, 2, 3);
+    SORT_INIT(Sort1000False,  sort_1000,  false, 2, 3);
+    SORT_INIT(SortMedFalse,   sort_med1,  false, 2, 3);
     // Takes too much time in current implementation. Enable when everything is parallel
     //SORT_INIT(SortMed5True,   sort_med,   true,  0, 1);
     //SORT_INIT(SortMed5False,  sort_med,   false, 2, 3);
diff --git a/test/susan.cpp b/test/susan.cpp
index df806c06be..259a319ce7 100644
--- a/test/susan.cpp
+++ b/test/susan.cpp
@@ -20,6 +20,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::dim4;
 
 typedef struct
@@ -27,7 +28,7 @@ typedef struct
     float f[5];
 } feat_t;
 
-bool feat_cmp(feat_t i, feat_t j)
+static bool feat_cmp(feat_t i, feat_t j)
 {
     for (int k = 0; k < 5; k++)
         if (i.f[k] != j.f[k])
@@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j)
     return false;
 }
 
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
+static void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat)
 {
     feat.resize(nfeat);
     for (unsigned i = 0; i < feat.size(); i++) {
diff --git a/test/svd_dense.cpp b/test/svd_dense.cpp
index f7ef2950e0..7ce31e2ee5 100644
--- a/test/svd_dense.cpp
+++ b/test/svd_dense.cpp
@@ -22,6 +22,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
@@ -34,12 +35,12 @@ typedef ::testing::Types<float, double, cfloat, cdouble> TestTypes;
 TYPED_TEST_CASE(svd, TestTypes);
 
 template<typename T>
-double get_val(T val)
+inline double get_val(T val)   // generic case: val is implicitly converted to double
 {
     return val;
 }
 
-template<> double get_val<cfloat>(cfloat val)
+template<> inline double get_val<cfloat>(cfloat val)   // cfloat case: goes through abs(val) rather than a direct conversion
 {
     return abs(val);
 }
diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp
index 758bf98e14..83f2552e08 100644
--- a/test/testHelpers.hpp
+++ b/test/testHelpers.hpp
@@ -6,6 +6,8 @@
  * The complete license agreement can be obtained at:
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
 
 #include <string>
 #include <fstream>
@@ -127,11 +129,11 @@ void readTestsFromFile(const std::string &FileName, std::vector<af::dim4> &input
     }
 }
 
-void readImageTests(const std::string        &pFileName,
-                    std::vector<af::dim4>    &pInputDims,
-                    std::vector<std::string> &pTestInputs,
-                    std::vector<dim_t>    &pTestOutSizes,
-                    std::vector<std::string> &pTestOutputs)
+inline void readImageTests(const std::string        &pFileName,
+                           std::vector<af::dim4>    &pInputDims,
+                           std::vector<std::string> &pTestInputs,
+                           std::vector<dim_t>    &pTestOutSizes,
+                           std::vector<std::string> &pTestOutputs)
 {
     using std::vector;
 
@@ -364,18 +366,18 @@ struct cond_type<false, T, Other> {
 };
 
 template<typename T>
-double real(T val) { return (double)val; }
+inline double real(T val) { return (double)val; } // generic case: the value itself, cast to double
 template<>
-double real<af::cdouble>(af::cdouble val) { return real(val); }
+inline double real<af::cdouble>(af::cdouble val) { return real(val); } // NOTE(review): presumably the unqualified call picks a non-template real() for af::cdouble (e.g. via ADL) - confirm it cannot recurse
 template<>
-double real<af::cfloat> (af::cfloat val) { return real(val); }
+inline double real<af::cfloat> (af::cfloat val) { return real(val); } // NOTE(review): same assumption as the af::cdouble specialization
 
 template<typename T>
-double imag(T val) { return (double)val; }
+inline double imag(T val) { return (double)val; } // NOTE(review): generic case returns the value itself cast to double, not 0 - confirm callers expect that
 template<>
-double imag<af::cdouble>(af::cdouble val) { return imag(val); }
+inline double imag<af::cdouble>(af::cdouble val) { return imag(val); } // NOTE(review): presumably the unqualified call picks a non-template imag() for af::cdouble (e.g. via ADL) - confirm it cannot recurse
 template<>
-double imag<af::cfloat> (af::cfloat val) { return imag(val); }
+inline double imag<af::cfloat> (af::cfloat val) { return imag(val); } // NOTE(review): same assumption as the af::cdouble specialization
 
 template<typename T>
 bool noDoubleTests()
@@ -388,37 +390,18 @@ bool noDoubleTests()
     return ((isTypeDouble && !isDoubleSupported) ? true : false);
 }
 
-bool noImageIOTests()
+inline bool noImageIOTests()   // returns true when the calling test should be skipped
 {
-    af_array arr = 0;
-    const af_err err = af_load_image(&arr, TEST_DIR"/imageio/color_small.png", true);
-
-    if(arr != 0) af_release_array(arr);
-
-    if(err == AF_ERR_NOT_CONFIGURED)
-        return true;    // Yes, disable test
-    else
-        return false;   // No, let test continue
+    if (af::isImageIOAvailable()) return false;
+    printf("Image IO Not Configured. Test will exit\n");
+    return true;
 }
 
-bool noLAPACKTests()
+inline bool noLAPACKTests()   // returns true when the calling test should be skipped
 {
-    // Run LU
-    af::dim4 dims(5, 5);
-    af_array in = 0, l = 0, u = 0, p= 0;
-    af_randu(&in, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits<float>::af_type);
-
-    af_err err = af_lu(&l, &u, &p, in);
-
-    if(in != 0) af_release_array(in);
-    if(l  != 0) af_release_array(l);
-    if(u  != 0) af_release_array(u);
-    if(p  != 0) af_release_array(p);
-
-    if(err == AF_ERR_NOT_CONFIGURED)
-        return true;    // Yes, disable test
-    else
-        return false;   // No, let test continue
+    if (af::isLAPACKAvailable()) return false;
+    printf("LAPACK Not Configured. Test will exit\n");
+    return true;
 }
 
 // TODO: perform conversion on device for CUDA and OpenCL
@@ -469,3 +452,5 @@ af::array cpu_randu(const af::dim4 dims)
 
     return af::array(dims, (T *)&out[0]);
 }
+
+#pragma GCC diagnostic pop
diff --git a/test/transform.cpp b/test/transform.cpp
new file mode 100644
index 0000000000..1950284c2d
--- /dev/null
+++ b/test/transform.cpp
@@ -0,0 +1,268 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <vector>
+#include <iostream>
+#include <string>
+#include <testHelpers.hpp>
+
+using std::vector;
+using std::string;
+using std::abs;
+using std::cout;
+using std::endl;
+
+// Typed fixture for the float/double transform tests; SetUp does nothing.
+template<typename T>
+class Transform : public ::testing::Test {
+public:
+    virtual void SetUp() {}
+};
+
+// Typed fixture for the integer-type transform tests (see TestTypesInt);
+// SetUp does nothing.
+template<typename T>
+class TransformInt : public ::testing::Test {
+public:
+    virtual void SetUp() {}
+};
+
+typedef ::testing::Types<float, double> TestTypes;
+typedef ::testing::Types<int, intl, uint, uintl, short, ushort, uchar> TestTypesInt;
+
+TYPED_TEST_CASE(Transform, TestTypes);
+TYPED_TEST_CASE(TransformInt, TestTypesInt);
+
+// Runs af_transform on the scene image listed in pTestFile with the matrix
+// from pHomographyFile and compares the result with the gold image. Skips
+// when T is an unsupported double type or image I/O is not configured.
+template<typename T>
+void transformTest(string pTestFile, string pHomographyFile, const af_interp_type method, const bool invert)
+{
+    if (noDoubleTests<T>()) return;
+    if (noImageIOTests()) return;   // af_load_image below needs image I/O support
+
+    vector<af::dim4> inNumDims;
+    vector<string>   inFiles;
+    vector<dim_t>    goldNumDims;
+    vector<string>   goldFiles;
+
+    readImageTests(pTestFile, inNumDims, inFiles, goldNumDims, goldFiles);
+
+    inFiles[0].insert(0,string(TEST_DIR"/transform/"));
+    inFiles[1].insert(0,string(TEST_DIR"/transform/"));
+    goldFiles[0].insert(0,string(TEST_DIR"/transform/"));
+
+    af::dim4 objDims = inNumDims[0];
+
+    vector<af::dim4>       HNumDims;
+    vector<vector<float> > HIn;
+    vector<vector<float> > HTests;
+    readTests<float, float, float>(pHomographyFile, HNumDims, HIn, HTests);
+
+    af::dim4 HDims = HNumDims[0];
+
+    af_array sceneArray_f32 = 0;
+    af_array goldArray_f32 = 0;
+    af_array sceneArray = 0;
+    af_array goldArray = 0;
+    af_array outArray = 0;
+    af_array HArray = 0;
+
+    ASSERT_EQ(AF_SUCCESS, af_load_image(&sceneArray_f32, inFiles[1].c_str(), false));
+    ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, goldFiles[0].c_str(), false));
+
+    ASSERT_EQ(AF_SUCCESS, conv_image<T>(&sceneArray, sceneArray_f32));
+    ASSERT_EQ(AF_SUCCESS, conv_image<T>(&goldArray, goldArray_f32));
+
+    ASSERT_EQ(AF_SUCCESS, af_create_array(&HArray, &(HIn[0].front()), HDims.ndims(), HDims.get(), f32));
+
+    ASSERT_EQ(AF_SUCCESS, af_transform(&outArray, sceneArray, HArray, objDims[0], objDims[1], method, invert));
+
+    // Element counts must agree, otherwise the comparison loop would read past outData
+    dim_t goldEl = 0;
+    dim_t outEl = 0;
+    ASSERT_EQ(AF_SUCCESS, af_get_elements(&goldEl, goldArray));
+    ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray));
+    ASSERT_EQ(goldEl, outEl);
+
+    // Copy gold data and result to the host (vectors free themselves if an ASSERT returns early)
+    vector<T> goldData(goldEl);
+    vector<T> outData(outEl);
+    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&goldData[0], goldArray));
+    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData[0], outArray));
+
+    const float thr = 1.1f;
+
+    // Maximum number of wrong pixels must be <= 0.01% of number of elements,
+    // this metric is necessary due to rounding errors between different
+    // backends for AF_INTERP_NEAREST and AF_INTERP_LOWER
+    const size_t maxErr = goldEl * 0.0001f;
+    size_t err = 0;
+
+    for (dim_t elIter = 0; elIter < goldEl; elIter++) {
+        err += fabs((float)floor(outData[elIter]) - (float)floor(goldData[elIter])) > thr;
+        if (err > maxErr)
+            ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl;
+    }
+
+    if(sceneArray_f32 != 0) af_release_array(sceneArray_f32);
+    if(goldArray_f32  != 0) af_release_array(goldArray_f32);
+    if(sceneArray     != 0) af_release_array(sceneArray);
+    if(goldArray      != 0) af_release_array(goldArray);
+    if(outArray       != 0) af_release_array(outArray);
+    if(HArray         != 0) af_release_array(HArray);
+}
+
+TYPED_TEST(Transform, PerspectiveNearest)   // float/double types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_nearest.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_NEAREST, false);
+}
+
+TYPED_TEST(Transform, PerspectiveBilinear)   // float/double types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_bilinear.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_BILINEAR, false);
+}
+
+TYPED_TEST(Transform, PerspectiveLower)   // float/double types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_lower.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_LOWER, false);
+}
+
+TYPED_TEST(Transform, PerspectiveNearestInvert)   // float/double types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_nearest.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_NEAREST, true);
+}
+
+TYPED_TEST(Transform, PerspectiveBilinearInvert)   // float/double types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_bilinear.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_BILINEAR, true);
+}
+
+TYPED_TEST(Transform, PerspectiveLowerInvert)   // float/double types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_lower.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_LOWER, true);
+}
+
+TYPED_TEST(TransformInt, PerspectiveNearest)   // integer types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_nearest.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_NEAREST, false);
+}
+
+TYPED_TEST(TransformInt, PerspectiveBilinear)   // integer types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_bilinear.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_BILINEAR, false);
+}
+
+TYPED_TEST(TransformInt, PerspectiveLower)   // integer types, invert = false
+{
+    const string imageList(TEST_DIR"/transform/tux_lower.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_LOWER, false);
+}
+
+TYPED_TEST(TransformInt, PerspectiveNearestInvert)   // integer types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_nearest.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_NEAREST, true);
+}
+
+TYPED_TEST(TransformInt, PerspectiveBilinearInvert)   // integer types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_bilinear.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_BILINEAR, true);
+}
+
+TYPED_TEST(TransformInt, PerspectiveLowerInvert)   // integer types, invert = true
+{
+    const string imageList(TEST_DIR"/transform/tux_lower.test");
+    const string matrixFile(TEST_DIR"/transform/tux_tmat_inverse.test");
+    transformTest<TypeParam>(imageList, matrixFile, AF_INTERP_LOWER, true);
+}
+
+
+///////////////////////////////////// CPP ////////////////////////////////
+//
+// C++ API check: warps the scene image with the tux_tmat matrix (nearest
+// interpolation, invert = false) and compares it with the gold image.
+TEST(Transform, CPP)
+{
+    if (noImageIOTests()) return;   // af::loadImage needs image I/O support
+
+    vector<af::dim4>   inDims;
+    vector<string> inFiles;
+    vector<dim_t>  goldDim;
+    vector<string> goldFiles;
+
+    vector<af::dim4> HDims;
+    vector<vector<float> >   HIn;
+    vector<vector<float> >   HTests;
+    readTests<float, float, float>(TEST_DIR"/transform/tux_tmat.test",HDims,HIn,HTests);
+
+    readImageTests(string(TEST_DIR"/transform/tux_nearest.test"), inDims, inFiles, goldDim, goldFiles);
+
+    inFiles[0].insert(0,string(TEST_DIR"/transform/"));
+    inFiles[1].insert(0,string(TEST_DIR"/transform/"));
+
+    goldFiles[0].insert(0,string(TEST_DIR"/transform/"));
+
+    // Transform matrix taken from the first entry of tux_tmat.test
+    af::array H = af::array(HDims[0][0], HDims[0][1], &(HIn[0].front()));
+
+    af::array scene_img = af::loadImage(inFiles[1].c_str(), false);
+    af::array gold_img = af::loadImage(goldFiles[0].c_str(), false);
+
+    af::array out_img = af::transform(scene_img, H, inDims[0][0], inDims[0][1], AF_INTERP_NEAREST, false);
+
+    // Size both host buffers from elements() and require equal counts so that
+    // neither host() nor the comparison loop can run past a buffer.
+    const dim_t n = gold_img.elements();
+    ASSERT_EQ(n, out_img.elements());
+    vector<float> h_out_img(n);
+    vector<float> h_gold_img(n);
+    out_img.host(&h_out_img[0]);
+    gold_img.host(&h_gold_img[0]);
+
+    const float thr = 1.0f;
+
+    // Maximum number of wrong pixels must be <= 0.01% of number of elements,
+    // this metric is necessary due to rounding errors between different
+    // backends for AF_INTERP_NEAREST and AF_INTERP_LOWER
+    const size_t maxErr = n * 0.0001f;
+    size_t err = 0;
+
+    for (dim_t elIter = 0; elIter < n; elIter++) {
+        // (int) truncates the result pixel before it is compared with the gold value
+        err += fabs((int)h_out_img[elIter] - h_gold_img[elIter]) > thr;
+        if (err > maxErr)
+            ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl;
+    }
+}
diff --git a/test/transform_coordinates.cpp b/test/transform_coordinates.cpp
new file mode 100644
index 0000000000..7f1ac4e893
--- /dev/null
+++ b/test/transform_coordinates.cpp
@@ -0,0 +1,118 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <vector>
+#include <iostream>
+#include <string>
+#include <testHelpers.hpp>
+
+using std::vector;
+using std::string;
+using std::cout;
+using std::endl;
+
+// Fixture for the typed tests in this file; it carries no state of its own.
+template<typename T>
+struct TransformCoordinates : ::testing::Test
+{
+    virtual void SetUp() {}  // intentionally empty
+};
+
+// Element types every TYPED_TEST of this fixture is instantiated with.
+typedef ::testing::Types<float, double> TestTypes;
+TYPED_TEST_CASE(TransformCoordinates, TestTypes);
+
+// Feeds input 0 of pTestFile (the matrix) to af_transform_coordinates once per remaining
+// input, whose first two values give d0/d1, and checks gold[test-1] within a 1.0 tolerance.
+template<typename T>
+void transformCoordinatesTest(string pTestFile)
+{
+    if (noDoubleTests<T>()) return;
+
+    vector<af::dim4>       inDims;
+    vector<vector<T> >     in;
+    vector<vector<float> > gold;
+    readTests<T, float, float>(pTestFile, inDims, in, gold);
+
+    af_array tfArray = 0;
+    ASSERT_EQ(AF_SUCCESS, af_create_array(&tfArray, &(in[0].front()), inDims[0].ndims(), inDims[0].get(), (af_dtype)af::dtype_traits<T>::af_type));
+
+    const float thr = 1.f;
+
+    for (size_t test = 1; test < in.size(); test++) {
+        dim_t d0 = (dim_t)in[test][0];
+        dim_t d1 = (dim_t)in[test][1];
+
+        // Each call hands back a new handle, so it is scoped to (and released in) this pass.
+        af_array outArray = 0;
+        ASSERT_EQ(AF_SUCCESS, af_transform_coordinates(&outArray, tfArray, d0, d1));
+
+        dim_t outEl = 0;
+        ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray));
+        ASSERT_GT(outEl, 0);
+        // A vector frees its storage even when an ASSERT_* below returns early.
+        vector<T> outData((size_t)outEl);
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData.front(), outArray));
+
+        for (size_t elIter = 0; elIter < outData.size(); elIter++) {
+            ASSERT_LE(fabs(outData[elIter] - gold[test-1][elIter]), thr) << "at: " << elIter << std::endl;
+        }
+
+        ASSERT_EQ(AF_SUCCESS, af_release_array(outArray));
+    }
+
+    ASSERT_EQ(AF_SUCCESS, af_release_array(tfArray));
+}
+
+TYPED_TEST(TransformCoordinates, RotateMatrix) {  // rotate_matrix.test, once per TestTypes entry
+    const string dataFile(TEST_DIR"/transformCoordinates/rotate_matrix.test");
+    transformCoordinatesTest<TypeParam>(dataFile);
+}
+
+TYPED_TEST(TransformCoordinates, 3DMatrix) {  // 3d_matrix.test, once per TestTypes entry
+    const string dataFile(TEST_DIR"/transformCoordinates/3d_matrix.test");
+    transformCoordinatesTest<TypeParam>(dataFile);
+}
+
+///////////////////////////////////// CPP ////////////////////////////////
+//
+// C++ API check: passes the first matrix of 3d_matrix.test, plus the d0/d1 values taken
+// from input 1, to af::transformCoordinates and compares the result against gold[0].
+TEST(TransformCoordinates, CPP)
+{
+    vector<af::dim4>       inDims;
+    vector<vector<float> > in;
+    vector<vector<float> > gold;
+
+    readTests<float, float, float>(TEST_DIR"/transformCoordinates/3d_matrix.test",inDims,in,gold);
+
+    af::array tf = af::array(inDims[0][0], inDims[0][1], &(in[0].front()));
+
+    float d0 = in[1][0];
+    float d1 = in[1][1];
+    af::array out = af::transformCoordinates(tf, d0, d1);
+    ASSERT_GT(out.elements(), 0);
+
+    // Sized from elements() so the buffer always holds the whole array; the vector
+    // also frees itself when an ASSERT_* below returns early.
+    vector<float> h_out((size_t)out.elements());
+    out.host(&h_out.front());
+
+    const size_t n = gold[0].size();
+    ASSERT_LE(n, h_out.size()) << "gold data is longer than the computed output";
+    const float thr = 1.f;
+
+    for (size_t elIter = 0; elIter < n; elIter++) {
+        ASSERT_LE(fabs(h_out[elIter] - gold[0][elIter]), thr) << "at: " << elIter << std::endl;
+    }
+}
diff --git a/test/translate.cpp b/test/translate.cpp
index 5b00c04ec8..355d30a553 100644
--- a/test/translate.cpp
+++ b/test/translate.cpp
@@ -20,6 +20,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/transpose.cpp b/test/transpose.cpp
index 6be1ba49ab..8437a12615 100644
--- a/test/transpose.cpp
+++ b/test/transpose.cpp
@@ -17,6 +17,7 @@
 
 using std::string;
 using std::vector;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
diff --git a/test/triangle.cpp b/test/triangle.cpp
index e0b609b9ab..6322070226 100644
--- a/test/triangle.cpp
+++ b/test/triangle.cpp
@@ -23,6 +23,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 using af::dim4;
diff --git a/test/where.cpp b/test/where.cpp
index eb21e0d6dc..08ed878aea 100644
--- a/test/where.cpp
+++ b/test/where.cpp
@@ -78,17 +78,6 @@ void whereTest(string pTestFile, bool isSubRef=false, const vector<af_seq> seqv=
     if(tempArray != 0) af_release_array(tempArray);
 }
 
-vector<af_seq> init_subs()
-{
-    vector<af_seq> subs;
-    subs.push_back(af_make_seq(2, 6, 1));
-    subs.push_back(af_make_seq(1, 5, 1));
-    subs.push_back(af_make_seq(1, 3, 1));
-    subs.push_back(af_make_seq(1, 2, 1));
-    return subs;
-}
-
-
 #define WHERE_TESTS(T)                          \
     TEST(Where,Test_##T)                        \
     {                                           \
@@ -132,3 +121,10 @@ TYPED_TEST(Where, CPP)
                                                         << std::endl;
     }
 }
+
+TEST(Where, ISSUE_1259) {  // presumably a regression test for issue 1259 -- confirm against tracker
+    // randu is documented to draw from [0, 1), so "> 2" should match nothing -- TODO confirm range
+    const af::array samples = af::randu(10, 10, 10);
+    const af::array hits    = af::where(samples > 2);
+    ASSERT_EQ(hits.elements(), 0);
+}
diff --git a/test/wrap.cpp b/test/wrap.cpp
index 0cc6fab909..091c5341c1 100644
--- a/test/wrap.cpp
+++ b/test/wrap.cpp
@@ -23,6 +23,7 @@ using std::vector;
 using std::string;
 using std::cout;
 using std::endl;
+using std::abs;
 using af::cfloat;
 using af::cdouble;
 
@@ -41,27 +42,27 @@ typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl
 TYPED_TEST_CASE(Wrap, TestTypes);
 
 template<typename T>
-double get_val(T val)
+inline double get_val(T val) // generic case: val is implicitly converted to double
 {
     return val;
 }
 
-template<> double get_val<cfloat>(cfloat val)
+template<> inline double get_val<cfloat>(cfloat val) // cfloat specialization: yields abs(val)
 {
     return abs(val);
 }
 
-template<> double get_val<cdouble>(cdouble val)
+template<> inline double get_val<cdouble>(cdouble val) // cdouble specialization: yields abs(val)
 {
     return abs(val);
 }
 
-template<> double get_val<unsigned char>(unsigned char val)
+template<> inline double get_val<unsigned char>(unsigned char val) // widens to int, then takes it modulo 256
 {
     return ((int)(val)) % 256;
 }
 
-template<> double get_val<char>(char val)
+template<> inline double get_val<char>(char val) // 1.0 for any non-zero val, 0.0 otherwise
 {
     return (val != 0);
 }