diff --git a/ArrayFireConfig.cmake.in b/ArrayFireConfig.cmake.in
index 3ffd8e0c51..c34b5a22c4 100644
--- a/ArrayFireConfig.cmake.in
+++ b/ArrayFireConfig.cmake.in
@@ -9,12 +9,14 @@
 #
 # ----------------------------------------------------------------------------
 #
-# ArrayFire_CPU_FOUND        - True of the ArrayFire CPU library has been found.
-# ArrayFire_CPU_LIBRARIES    - Location of ArrayFire's CPU library, if found
-# ArrayFire_CUDA_FOUND       - True of the ArrayFire CUDA library has been found.
-# ArrayFire_CUDA_LIBRARIES   - Location of ArrayFire's CUDA library, if found
-# ArrayFire_OpenCL_FOUND     - True of the ArrayFire OpenCL library has been found.
-# ArrayFire_OpenCL_LIBRARIES - Location of ArrayFire's OpenCL library, if found
+# ArrayFire_CPU_FOUND         - True of the ArrayFire CPU library has been found.
+# ArrayFire_CPU_LIBRARIES     - Location of ArrayFire's CPU library, if found
+# ArrayFire_CUDA_FOUND        - True of the ArrayFire CUDA library has been found.
+# ArrayFire_CUDA_LIBRARIES    - Location of ArrayFire's CUDA library, if found
+# ArrayFire_OpenCL_FOUND      - True of the ArrayFire OpenCL library has been found.
+# ArrayFire_OpenCL_LIBRARIES  - Location of ArrayFire's OpenCL library, if found
+# ArrayFire_Unified_FOUND     - True of the ArrayFire Unified library has been found.
+# ArrayFire_Unified_LIBRARIES - Location of ArrayFire's Unified library, if found
 #
 #=============================================================================
 # Copyright (c) 2015, ArrayFire
@@ -48,17 +50,23 @@
 
 get_filename_component(ArrayFire_INCLUDE_DIRS "@INCLUDE_DIR@" ABSOLUTE)
 
-# keep in the backends in the slowest to fastest order
-foreach(backend CPU OpenCL CUDA)
-  string(TOLOWER "${backend}" lowerbackend)
+macro(find_backend backend libname)
   set(targetFile ${CMAKE_CURRENT_LIST_DIR}/@BACKEND_DIR@/ArrayFire${backend}.cmake)
   if(EXISTS ${targetFile})
     include(${targetFile})
     set(ArrayFire_${backend}_FOUND ON)
-    set(ArrayFire_${backend}_LIBRARIES af${lowerbackend})
+    set(ArrayFire_${backend}_LIBRARIES af${libname})
     # set the default backend
-    set(ArrayFire_LIBRARIES af${lowerbackend})
+    set(ArrayFire_LIBRARIES af${libname})
   else()
     set(ArrayFire_${backend}_FOUND OFF)
   endif()
+endmacro()
+
+# keep in the backends in the slowest to fastest order
+foreach(backend CPU OpenCL CUDA)
+  string(TOLOWER "${backend}" lowerbackend)
+  find_backend("${backend}" "${lowerbackend}")
 endforeach()
+
+find_backend("Unified" "")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aee379cf43..c76ef4b430 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,6 +35,8 @@ OPTION(BUILD_SIFT "Build ArrayFire nonfree algorithms" OFF)
 
 MARK_AS_ADVANCED(BUILD_SIFT)
 
+OPTION(BUILD_UNIFIED "Build Backend-Independent ArrayFire API" ON)
+
 # Set a default build type if none was specified
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
@@ -163,6 +165,11 @@ IF(${BUILD_OPENCL})
     ADD_SUBDIRECTORY(src/backend/opencl)
 ENDIF()
 
+IF(${BUILD_UNIFIED})
+    ADD_DEFINITIONS(-DAF_UNIFIED)
+    ADD_SUBDIRECTORY(src/api/unified)
+ENDIF()
+
 IF(${BUILD_DOCS})
     ADD_SUBDIRECTORY(docs)
 ENDIF()
diff --git a/CMakeModules/Version.cmake b/CMakeModules/Version.cmake
index 7dca44e778..3a474d1755 100644
--- a/CMakeModules/Version.cmake
+++ b/CMakeModules/Version.cmake
@@ -2,8 +2,8 @@
 # Make a version file that includes the ArrayFire version and git revision
 #
 SET(AF_VERSION_MAJOR "3")
-SET(AF_VERSION_MINOR "1")
-SET(AF_VERSION_PATCH "3")
+SET(AF_VERSION_MINOR "2")
+SET(AF_VERSION_PATCH "0")
 
 SET(AF_VERSION "${AF_VERSION_MAJOR}.${AF_VERSION_MINOR}.${AF_VERSION_PATCH}")
 SET(AF_API_VERSION_CURRENT ${AF_VERSION_MAJOR}${AF_VERSION_MINOR})
diff --git a/CMakeModules/build_clBLAS.cmake b/CMakeModules/build_clBLAS.cmake
index faa415185e..d0a9e135bf 100644
--- a/CMakeModules/build_clBLAS.cmake
+++ b/CMakeModules/build_clBLAS.cmake
@@ -14,7 +14,7 @@ ENDIF()
 ExternalProject_Add(
     clBLAS-external
     GIT_REPOSITORY https://github.com/arrayfire/clBLAS.git
-    GIT_TAG 47662a6ac1186c756508109d7fef8827efab4504
+    GIT_TAG 102c832825e8e4d60ad73ca97e95668463294068
     PREFIX "${prefix}"
     INSTALL_DIR "${prefix}"
     UPDATE_COMMAND ""
diff --git a/CMakeModules/build_forge.cmake b/CMakeModules/build_forge.cmake
index 5784b76f0f..21b8aac8ad 100644
--- a/CMakeModules/build_forge.cmake
+++ b/CMakeModules/build_forge.cmake
@@ -22,7 +22,7 @@ ENDIF()
 ExternalProject_Add(
     forge-ext
     GIT_REPOSITORY https://github.com/arrayfire/forge.git
-    GIT_TAG af3.1.2
+    GIT_TAG af3.2.0
     PREFIX "${prefix}"
     INSTALL_DIR "${prefix}"
     UPDATE_COMMAND ""
diff --git a/CMakeModules/osx_install/OSXInstaller.cmake b/CMakeModules/osx_install/OSXInstaller.cmake
index 4a1fc97845..dc3a8b2491 100644
--- a/CMakeModules/osx_install/OSXInstaller.cmake
+++ b/CMakeModules/osx_install/OSXInstaller.cmake
@@ -76,7 +76,7 @@ PKG_BUILD(  PKG_NAME        ArrayFireCPU
             SCRIPT_DIR      ${OSX_INSTALL_DIR}/cpu_scripts
             IDENTIFIER      com.arrayfire.pkg.arrayfire.cpu.lib
             PATH_TO_FILES   package/lib
-            FILTERS         opencl cuda)
+            FILTERS         opencl cuda unified)
 
 PKG_BUILD(  PKG_NAME        ArrayFireCUDA
             DEPENDS         afcuda
@@ -85,7 +85,7 @@ PKG_BUILD(  PKG_NAME        ArrayFireCUDA
             SCRIPT_DIR      ${OSX_INSTALL_DIR}/cuda_scripts
             IDENTIFIER      com.arrayfire.pkg.arrayfire.cuda.lib
             PATH_TO_FILES   package/lib
-            FILTERS         cpu opencl)
+            FILTERS         cpu opencl unified)
 
 PKG_BUILD(  PKG_NAME        ArrayFireOPENCL
             DEPENDS         afopencl
@@ -93,7 +93,15 @@ PKG_BUILD(  PKG_NAME        ArrayFireOPENCL
             INSTALL_LOCATION /usr/local/lib
             IDENTIFIER      com.arrayfire.pkg.arrayfire.opencl.lib
             PATH_TO_FILES   package/lib
-            FILTERS         cpu cuda)
+            FILTERS         cpu cuda unified)
+
+PKG_BUILD(  PKG_NAME        ArrayFireUNIFIED
+            DEPENDS         af
+            TARGETS         unified_package
+            INSTALL_LOCATION /usr/local/lib
+            IDENTIFIER      com.arrayfire.pkg.arrayfire.unified.lib
+            PATH_TO_FILES   package/lib
+            FILTERS         cpu cuda opencl)
 
 PKG_BUILD(  PKG_NAME        ArrayFireHeaders
             TARGETS         header_package
@@ -107,5 +115,5 @@ PKG_BUILD(  PKG_NAME        ArrayFireExtra
             IDENTIFIER      com.arrayfire.pkg.arrayfire.extra
             PATH_TO_FILES   package/share)
 
-PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${header_package} ${extra_package})
+PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${extra_package})
 
diff --git a/CMakeModules/osx_install/distribution.dist b/CMakeModules/osx_install/distribution.dist
index 6fe9ba09cb..3dc82379c9 100644
--- a/CMakeModules/osx_install/distribution.dist
+++ b/CMakeModules/osx_install/distribution.dist
@@ -4,32 +4,55 @@
     <welcome    file="${WELCOME_FILE_OUT}" />
     <readme     file="${README_FILE_OUT}" mime-type="test/html" />
     <license    file="${CMAKE_MODULE_PATH}/../LICENSE" mime-type="test/plain" />
+    <script>
+      function CheckBackendSelected() {
+      return choices.opencl_lib.selected ||
+            choices.cuda_lib.selected ||
+            choices.cpu_lib.selected;
+      }
+    </script>
 
     <pkg-ref id="com.arrayfire.arrayfire.cpu.lib"       version="${AF_VERSION}" onConclusion="none">ArrayFireCPU.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.cuda.lib"      version="${AF_VERSION}" onConclusion="none">ArrayFireCUDA.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.opencl.lib"    version="${AF_VERSION}" onConclusion="none">ArrayFireOPENCL.pkg</pkg-ref>
+    <pkg-ref id="com.arrayfire.arrayfire.unified.lib"   version="${AF_VERSION}" onConclusion="none">ArrayFireUNIFIED.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.inc"           version="${AF_VERSION}" onConclusion="none">ArrayFireHeaders.pkg</pkg-ref>
     <pkg-ref id="com.arrayfire.arrayfire.extra"         version="${AF_VERSION}" onConclusion="none">ArrayFireExtra.pkg</pkg-ref>
     <options customize="always" require-scripts="false"/>
     <choices-outline>
         <line choice="libs">
-            <line choice="com.arrayfire.arrayfire.cpu.lib"/>
-            <line choice="com.arrayfire.arrayfire.cuda.lib"/>
-            <line choice="com.arrayfire.arrayfire.opencl.lib"/>
+            <line choice="cpu_lib"/>
+            <line choice="cuda_lib"/>
+            <line choice="opencl_lib"/>
+            <line choice="com.arrayfire.arrayfire.unified.lib"/>
         </line>
         <line choice="com.arrayfire.arrayfire.inc"/>
         <line choice="com.arrayfire.arrayfire.extra"/>
     </choices-outline>
     <choice id="libs" title="ArrayFire Libraries" visible="true" />
-    <choice title="CPU Libraries" description="CPU Libraries" id="com.arrayfire.arrayfire.cpu.lib" visible="true" enabled="true">
+    <choice title="CPU Libraries"
+            description="ArrayFire targeting CPUs."
+            id="cpu_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.cpu.lib"/>
     </choice>
-    <choice title="CUDA Libraries" description="CUDA Libraries" id="com.arrayfire.arrayfire.cuda.lib" visible="true" enabled="true">
+    <choice title="CUDA Libraries"
+            description="ArrayFire which targets the CUDA platform. This platform allows you to to take advantage of the CUDA enabled GPUs to run ArrayFire code."
+            id="cuda_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.cuda.lib"/>
     </choice>
-    <choice title="OpenCL Libraries" description="OpenCL Libraries" id="com.arrayfire.arrayfire.opencl.lib" visible="true" enabled="true">
+    <choice title="OpenCL Libraries"
+            description="ArrayFire which targets the OpenCL platform. This platform allows you to use the ArrayFire library which targets OpenCL devices. NOTE: Currently ArrayFire does not support OpenCL for the Intel CPU on Apple."
+            id="opencl_lib" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.opencl.lib"/>
     </choice>
+    <choice title="Unified Library"
+            description="This library will allow you to choose the platform(cpu, cuda, opencl) at runtime. NOTE: This option requires the other platforms to work properly"
+            id="com.arrayfire.arrayfire.unified.lib"
+            selected="CheckBackendSelected()"
+            visible="true"
+            enabled="CheckBackendSelected()">
+        <pkg-ref id="com.arrayfire.arrayfire.unified.lib"/>
+    </choice>
     <choice title="ArrayFire Headers" description="ArrayFire Headers" id="com.arrayfire.arrayfire.inc" visible="true" enabled="true">
         <pkg-ref id="com.arrayfire.arrayfire.inc"/>
     </choice>
diff --git a/README.md b/README.md
index b5aa3b0eef..695adbed03 100644
--- a/README.md
+++ b/README.md
@@ -20,12 +20,10 @@ ArrayFire binary installers can be downloaded at the [ArrayFire Downloads](http:
 * Email: <mailto:technical@arrayfire.com>
 
 ### Build Status
-|                 | Build           | Tests           |
-|-----------------|-----------------|-----------------|
-| Linux x86       | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/devel)](http://ci.arrayfire.org/job/arrayfire-linux/branch/devel/)      | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux-test/devel)](http://ci.arrayfire.org/job/arrayfire-linux-test/branch/devel/)              |
-| Linux Tegra     | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegra/devel)](http://ci.arrayfire.org/job/arrayfire-tegra/branch/devel/)      | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegra-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegra-test/branch/devel/)              |
-| Windows         | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/devel)](http://ci.arrayfire.org/job/arrayfire-windows/branch/devel/)  | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows-test/devel)](http://ci.arrayfire.org/job/arrayfire-windows-test/branch/devel/)          |
-| OSX             | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/devel)](http://ci.arrayfire.org/job/arrayfire-osx/branch/devel/)          | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx-test/devel)](http://ci.arrayfire.org/job/arrayfire-osx-test/branch/devel/)                  |
+|         | Linux x86 | Linux armv7l | Linux aarch64 | Windows | OSX |
+|:-------:|:---------:|:------------:|:-------------:|:-------:|:---:|
+| Build   | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/devel)](http://ci.arrayfire.org/job/arrayfire-linux/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/devel)](http://ci.arrayfire.org/job/arrayfire-windows/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/devel)](http://ci.arrayfire.org/job/arrayfire-osx/branch/devel/) |
+| Test    | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux-test/devel)](http://ci.arrayfire.org/job/arrayfire-linux-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows-test/devel)](http://ci.arrayfire.org/job/arrayfire-windows-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx-test/devel)](http://ci.arrayfire.org/job/arrayfire-osx-test/branch/devel/) |
 
 Test coverage: [![Coverage Status](https://coveralls.io/repos/arrayfire/arrayfire/badge.svg?branch=HEAD)](https://coveralls.io/r/arrayfire/arrayfire?branch=HEAD)
 
diff --git a/assets b/assets
index d5b0b7cd5d..7c2a12739a 160000
--- a/assets
+++ b/assets
@@ -1 +1 @@
-Subproject commit d5b0b7cd5d44299458696571df7fb1aa7d99701e
+Subproject commit 7c2a12739ac0f5830d26334731e9ac96ba01e2d7
diff --git a/docs/details/algorithm.dox b/docs/details/algorithm.dox
index d2d0d50bd7..a823572b59 100644
--- a/docs/details/algorithm.dox
+++ b/docs/details/algorithm.dox
@@ -15,6 +15,15 @@ This function performs the operation across all batches present in the input sim
 
 Find the sum of values in the input
 
+This table defines the return value types for the corresponding input types
+
+Input Type          | Output Type
+--------------------|---------------------
+f32, f64, c32, c64  | same as input
+s32, u32, s64, u64  | same as input
+s16                 | s32
+u16, u8, b8         | u32
+
 \copydoc batch_detail_algo
 
 
@@ -25,6 +34,15 @@ Find the sum of values in the input
 
 Find the product of values in the input
 
+This table defines the return value types for the corresponding input types
+
+Input Type          | Output Type
+--------------------|---------------------
+f32, f64, c32, c64  | same as input
+s32, u32, s64, u64  | same as input
+s16                 | s32
+u16, u8, b8         | u32
+
 \copydoc batch_detail_algo
 
 
@@ -55,6 +73,8 @@ Find the maximum values and their locations
 
 Find if of all of the values in input are true
 
+Return type is b8 for all input types
+
 \copydoc batch_detail_algo
 
 
@@ -65,6 +85,8 @@ Find if of all of the values in input are true
 
 Find if of any of the values in input are true
 
+Return type is b8 for all input types
+
 \copydoc batch_detail_algo
 
 
@@ -75,6 +97,8 @@ Find if of any of the values in input are true
 
 Count the number of non-zero elements in the input
 
+Return type is u32 for all input types
+
 \copydoc batch_detail_algo
 
 
@@ -85,6 +109,15 @@ Count the number of non-zero elements in the input
 
 Perform exclusive sum along specified dimension
 
+This table defines the return value types for the corresponding input types
+
+Input Type          | Output Type
+--------------------|---------------------
+f32, f64, c32, c64  | same as input
+s32, u32, s64, u64  | same as input
+s16                 | s32
+u16, u8, b8         | u32
+
 \copydoc batch_detail_algo
 
 
@@ -95,6 +128,8 @@ Perform exclusive sum along specified dimension
 
 Locate the indices of non-zero elements
 
+Return type is u32 for all input types
+
 The locations are provided by flattening the input into a linear array.
 
 
@@ -135,7 +170,8 @@ Sort an multi dimensional array
 
 Sort input arrays get the sorted indices
 
-Sort a multi dimensional array and return sorted indices
+Sort a multi dimensional array and return sorted indices. Index array is of
+type u32.
 
 
 
diff --git a/docs/details/backend.dox b/docs/details/backend.dox
new file mode 100644
index 0000000000..fafa453e6f
--- /dev/null
+++ b/docs/details/backend.dox
@@ -0,0 +1,66 @@
+/**
+\addtogroup arrayfire_func
+@{
+
+\defgroup unified_func_setbackend setBackend
+
+\brief Set the current backend when using Unified backend
+
+This is a noop when using one of CPU, CUDA, or OpenCL backend.
+
+However, when using on of those 3 but trying to set it to a different backend
+will return in an exception.
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
+\defgroup unified_func_getbackendcount getBackendCount
+
+\brief Get the number of backends whose libraries were successfully loaded.
+
+This will be between 0-3. 0 Being no backends were loaded and 3 being all
+backends loaded successfully.
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
+\defgroup unified_func_getavailbackends getAvailableBackends
+
+\brief Returns an integer indicating the backends loaded successfully.
+
+The number returned denotes the backends available according to the table:
+
+Return Value | Backends Available
+-------------|-----------------------
+0            | None
+1            | CPU
+2            | CUDA
+3            | CPU and CUDA
+4            | OpenCL
+5            | CPU and OpenCL
+6            | CUDA and OpenCL
+7            | CPU, CUDA and OpenCL
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
+\defgroup unified_func_getbackendid getBackendId
+
+\brief Get's the backend enum for an array
+
+This will return one of the values from the \ref af_backend enum.
+The return value specifies which backend the array was created on.
+
+\ingroup unified_func
+\ingroup arrayfire_func
+
+=======================================================================
+
+@}
+*/
diff --git a/docs/details/image.dox b/docs/details/image.dox
index 4e1b0a5cdc..234f4f72e9 100644
--- a/docs/details/image.dox
+++ b/docs/details/image.dox
@@ -329,6 +329,9 @@ distance as well as the color distance.
 The bilateral filter requires the size of the filter (in pixels) and the upper
 bound on color values, N, where pixel values range from 0–N inclusively.
 
+The return type of the array is f64 for f64 input, f32 for all other input
+types.
+
 =======================================================================
 
 \defgroup image_func_erode erode
diff --git a/docs/details/index.dox b/docs/details/index.dox
index 85386b25db..90b9924d5e 100644
--- a/docs/details/index.dox
+++ b/docs/details/index.dox
@@ -14,6 +14,12 @@
 
 \brief Copy and write values in the locations specified by the sequences
 
+\ingroup index_mat
+
+\defgroup index_func_util util
+
+\brief Utility functions to create objects of type \ref af_index_t
+
 \ingroup index_mat
 @}
 */
diff --git a/docs/details/vision.dox b/docs/details/vision.dox
index af0f11437e..1d9d6b99ac 100644
--- a/docs/details/vision.dox
+++ b/docs/details/vision.dox
@@ -90,6 +90,29 @@ before using it, make sure you have the appropriate permission to do so.
 
 =======================================================================
 
+\defgroup cv_func_gloh gloh
+\ingroup featdescriptor_mat
+
+\brief SIFT feature detector and GLOH descriptor extractor
+
+Detects features using the Scale Invariant Feature Transform (SIFT),
+by David Lowe. Descriptors are extracted using Gradient Location and
+Orientation Histogram (GLOH).
+
+Lowe, D. G., "Distinctive Image Features from Scale-Invariant Keypoints",
+International Journal of Computer Vision, 60, 2, pp. 91-110, 2004.
+
+Mikolajczyk, K., and Schmid, C., "A performance evaluation of local
+descriptors", IEEE Transactions on Pattern Analysis and Machine Intelligence,
+10, 27, pp. 1615-1630, 2005.
+
+WARNING: Although GLOH is free of patents, the SIFT algorithm, used to detect
+features that will later be used by GLOH descriptors, is patented by the
+University of British Columbia, before using it, make sure you have the
+appropriate permission to do so.
+
+=======================================================================
+
 \defgroup cv_func_hamming_matcher hammingMatcher
 \ingroup featmatcher_mat
 
@@ -147,6 +170,30 @@ Template matching is an image processing technique to find small patches of an i
 match a given template image. A more in depth discussion on the topic can be found
 [here](http://en.wikipedia.org/wiki/Template_matching).
 
+=======================================================================
+
+\defgroup cv_func_homography homography
+\ingroup homography_mat
+
+\brief Homography Estimation
+
+Homography estimation find a perspective transform between two sets of 2D points.
+Currently, two methods are supported for the estimation, RANSAC (RANdom SAmple Consensus)
+and LMedS (Least Median of Squares). Both methods work by randomly selecting a subset
+of 4 points of the set of source points, computing the eigenvectors of that set and
+finding the perspective transform. The process is repeated several times, a maximum of
+times given by the value passed to the iterations arguments for RANSAC (for the CPU
+backend, usually less than that, depending on the quality of the dataset, but for CUDA
+and OpenCL backends the transformation will be computed exactly the amount of times
+passed via the iterations parameter), the returned value is the one that matches the
+best number of inliers, which are all of the points that fall within a maximum L2
+distance from the value passed to the inlier_thr argument. For the LMedS case, the
+number of iterations is currently hardcoded to meet the following equation:
+
+\f$ m = \frac{log(1 - P)}{log[1 - {(1 - \epsilon)}^{p}]}\f$,
+
+where \f$ P = 0.99\f$, \f$ \epsilon = 40\%\f$ and \f$ p = 4\f$.
+
 
 
 @}
diff --git a/docs/layout.xml b/docs/layout.xml
index d637c7f55a..3a66b563e4 100644
--- a/docs/layout.xml
+++ b/docs/layout.xml
@@ -3,10 +3,12 @@
   <navindex>
     <tab type="mainpage" visible="yes" title="" />
     <tab type="usergroup" visible="yes" title="Tutorials">
+      <tab type="user" url="\ref installing" visible="yes" title="Installation"/>
       <tab type="user" url="\ref using_on_linux" visible="yes" title="Using on Linux"/>
       <tab type="user" url="\ref using_on_windows" visible="yes" title="Using on Windows"/>
       <tab type="user" url="\ref using_on_osx" visible="yes" title="Using on OSX"/>
       <tab type="user" url="\ref gettingstarted" visible="yes" title="Getting Started"/>
+      <tab type="user" url="\ref unifiedbackend" visible="yes" title="Unified Backend"/>
       <tab type="user" url="\ref matrixmanipulation" visible="yes" title="Matrix Manipulation"/>
       <tab type="user" url="\ref indexing" visible="yes" title="Indexing"/>
       <tab type="user" url="\ref timing" visible="yes" title="Timing ArrayFire"/>
diff --git a/docs/pages/INSTALL.md b/docs/pages/INSTALL.md
index 3d9983aff9..dabb10b318 100644
--- a/docs/pages/INSTALL.md
+++ b/docs/pages/INSTALL.md
@@ -2,12 +2,21 @@ ArrayFire binary installation instructions {#installing}
 =====
 
 Installing ArrayFire couldn't be easier. We ship installers for Windows,
-OSX, and several variants of Linux. In general the installation procedure
-proceeds like this:
+OSX, and Linux. Although you could
+[build ArrayFire from source](https://github.com/arrayfire/arrayfire), we
+suggest using our pre-compiled binaries as they include the Intel Math
+Kernel Library to accelerate linear algebra functions.
 
-1. [Download](http://arrayfire.com/download/) the ArrayFire installer for your
+Please note that although our download page requires a valid login, registration
+is free and downloading ArrayFire is also free. We request your contact
+information so that we may notify you of software updates and occasionally
+collect user feedback about our library.
+
+In general, the installation process for ArrayFire looks like this:
+
+1. Install prerequisites
+2. [Download](http://arrayfire.com/download/) the ArrayFire installer for your
    operating system
-2. Install prerequisites
 3. Install ArrayFire
 4. Test the installation
 5. [Where to go for help?](#GettingHelp)
@@ -16,107 +25,138 @@ Below you will find instructions for
 
 * [Windows](#Windows)
 * Linux including
-    * [Debian (.deb) 8](#Debian)
-    * [Ubuntu (.deb) 14.10 and later](#Ubuntu)
-    * [Fedora (.rpm) 21](#Fedora)
+    * [Debian 8](#Debian)
+    * [Ubuntu 14.04 and later](#Ubuntu)
+    * [RedHat, Fedora, and CentOS](#RPM-distros)
 * [Mac OSX (.sh and brew)](#OSX)
 
 # <a name="Windows"></a> Windows
 
-Simply [download](http://arrayfire.com/download/) and run the installer.
 If you wish to use CUDA or OpenCL please ensure that you have also installed
 support for these technologies from your video card vendor's website.
 
+Next [download](http://arrayfire.com/download/) and run the ArrayFire installer.
+After it has completed, you need to add ArrayFire to the path for all users.
+
+1. Open Advanced System Settings:
+    * Windows 8: Move the Mouse pointer to the bottom right corner of the
+      screen, Right click, choose System. Then click "Advanced System Settings"
+    * Windows 7: Open the Start Menu and Right Click on "Computer". Then choose
+      Properties and click "Advanced System Settings"
+2. In Advanced System Settings window, click on Advanced tab
+3. Click on Environment Variables, then under System Variables, find PATH, and
+   click on it.
+4. In edit mode, append %AF_PATH%/lib. NOTE: Ensure that there is a semi-colon
+   separating %AF_PATH%/lib from any existing content (e.g.
+   EXISTING_PATHS;%AF_PATH%/lib;) otherwise other software may not function
+   correctly.
+
+Finally, verify that the path addition worked correctly. You can do this by:
+
+1. Open Visual Studio 2013. Open the HelloWorld solution which is located at
+   `%AF_PATH%/examples/helloworld/helloworld.exe`.
+2. Build and run the helloworld example. Be sure to, select the
+   platform/configuration of your choice using the platform drop-down (the
+   options are CPU, CUDA, and OpenCL) and Solution Configuration drop down
+   (options of Release and Debug) menus. Run the helloworld example
+
 # Linux
 
 ## <a name="Debian"></a> Debian 8
 
-First [download](http://arrayfire.com/download/) ArrayFire. Then, using the
-`gdebi` package manager, you can install ArrayFire and all dependencies as
-follows:
-
-    gdebi arrayfire*.deb
-
-If you prefer to use the `.sh` installer, it and all prerequisite packages
-may be installed as follows:
+First install the prerequisite packages:
 
     # Prerequisite packages:
-    apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev cmake
+    apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev libglew-dev libglewmx-dev libglfw3-dev cmake
 
     # Enable GPU support (OpenCL):
     apt-get install ocl-icd-libopencl1
 
-    # Run Installer
-    ./arrayfire_3.0.0_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
+If you wish to use CUDA, please
+[download the latest version of CUDA](https://developer.nvidia.com/cuda-zone)
+and install it on your system.
 
-To enable CUDA support, edit `/etc/apt/sources.list` and append `non-free`
-to the line containing `deb http://.../debian jessie main`. Then, as root, run
+Next [download](http://arrayfire.com/download/) ArrayFire. After you have the
+file, run the installer.
 
-    apt-get update
-    apt-get install nvidia-cuda-dev
+    ./arrayfire_*_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
 
-## <a name="Fedora"></a> Fedora 21
+## <a name="RPM-distros"></a> RedHat, Fedora, and CentOS
 
-First [download](http://arrayfire.com/download/) ArrayFire. Then, using the
-`yum` package manager, you can install ArrayFire and all dependencies as
-follows:
-
-    yum --nogpgcheck localinstall arrayfire*.rpm
-
-Or with the self-extracting installer
+First install the prerequisite packages:
 
     # Install prerequiste packages
-    yum install freeimage atlas fftw cmake
+    yum install freeimage atlas fftw libGLEW libGLEWmx glfw cmake
 
-    # Run Installer
-    ./arrayfire_3.0.0_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
+On Centos and Redhat the `glfw` package is outdated and you will need to compile
+it from source. Please
+[these instructions](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire).
 
-## <a name="Ubuntu"></a> Ubuntu 14.10 and later
+If you wish to use CUDA, please
+[download the latest version of CUDA](https://developer.nvidia.com/cuda-downloads)
+and install it on your system.
 
-First [download](http://arrayfire.com/download/) ArrayFire. Then, using the
-`gdebi` package manager, you can install ArrayFire and all dependencies as
-follows:
+Next [download](http://arrayfire.com/download/) ArrayFire. After you have the
+file, run the installer.
 
-    sudo apt-get install gdebi
-    gdebi arrayfire*.deb
+    ./arrayfire_*_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
 
-If you prefer to use the `.sh` installer, it and all prerequisite packages
-may be installed as follows:
+## <a name="Ubuntu"></a> Ubuntu 14.04 and later
+
+First install the prerequisite packages:
 
     # Prerequisite packages:
     sudo apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev cmake
 
-    # Enable GPU support (OpenCL and/or CUDA):
-    sudo apt-get install ocl-icd-libopencl1
-    sudo apt-get install nvidia-cuda-dev
+Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the library from source (following the instructions listed) or install the library from a PPA as follows:
 
-    # Run Installer
-    sudo ./arrayfire_3.0.0_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
+```
+sudo apt-add repository ppa:keithw/glfw3
+sudo apt-get update
+sudo apt-get install glfw3
+```
 
-# <a name="OSX"></a> Mac OSX
+After this point, the installation should proceed identically to Ubuntu 14.10 or newer.
 
-## Self-extracting zip from ArrayFire website
+If your system has a CUDA GPU, we suggest downloading the latest drivers
+from NVIDIA in the form of a Debian package and installing using the
+package manager. At present, CUDA downloads can be found on the
+[NVIDIA CUDA download page](https://developer.nvidia.com/cuda-downloads)
+Follow NVIDIA's instructions for getting CUDA set up.
 
-On OSX there are several dependencies that are not integrated into the
-operating system. It is easiest to install these using [Homebrew](http://brew.sh/),
-but you can also build them yourself if you prefer.
+If you wish to use OpenCL, simply install the OpenCL ICD loader along
+with any drivers required for your hardware.
+
+    # Enable GPU support (OpenCL):
+    apt-get install ocl-icd-libopencl1
+
+### Special instructions for Tegra K1
+If you are using ArrayFire on the Tegra K1 also install these packages:
 
-First [download](http://arrayfire.com/download/) ArrayFire. You may install
-ArrayFire to `/usr/local` from XTerm using the following commands:
+    sudo apt-get install libatlas3gf-base libatlas-dev libfftw3-dev liblapacke-dev
 
-    brew install boost fftw cmake freeimage
+In addition to these packages, you will need to compile GLFW3 from source
+using the instructions above.
 
-    sudo ./arrayfire_3.0.0_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
+Finally, [download](http://arrayfire.com/download/) ArrayFire. After you have
+the file, run the installer using:
 
-## Brew installation
+    ./arrayfire_*_Linux_x86_64.sh --exclude-subdir --prefix=/usr/local
+
+# <a name="OSX"></a> Mac OSX
+
+On OSX there are several dependencies that are not integrated into the
+operating system. The ArrayFire installer automatically satisfies these
+dependencies using [Homebrew](http://brew.sh/).
+If you don't have Homebrew installed on your system, the ArrayFire installer
+will ask you do to so.
 
-GitHub user [sutoiku](https://github.com/sutoiku) has been kind enough to
-write a brew installation script for ArrayFire. This installation method will
-download and compile ArrayFire and all prerequisites. Please remember to
-register on the ArrayFire website so we can keep you up to date about new
-versions of our software!
+Simply [download](http://arrayfire.com/download) the ArrayFire installer
+and double-click it to carry out the installation.
 
-    brew install arrayfire
+ArrayFire can also be installed through Homebrew directly using
+`brew install arrayfire`; however, it will
+not include MKL acceleration of linear algebra functions.
 
 ## Testing installation
 
diff --git a/docs/pages/getting_started.md b/docs/pages/getting_started.md
index 451f994f60..6d1c7cdd3d 100644
--- a/docs/pages/getting_started.md
+++ b/docs/pages/getting_started.md
@@ -17,6 +17,8 @@ underlying data may be one of various [basic types](\ref af::af_dtype):
 * [c64](\ref c64) complex double-precision (`cdouble`)
 * [s64](\ref s64) 64-bit signed integer (`intl`)
 * [u64](\ref u64) 64-bit unsigned integer (`uintl`)
+* [s16](\ref s16) 16-bit signed integer (`short`)
+* [u16](\ref u16) 16-bit unsigned integer (`unsigned short`)
 
 Older devices may not support double precision operations.
 
diff --git a/docs/pages/gfor.md b/docs/pages/gfor.md
index 28410a7f18..a7ed9a195d 100644
--- a/docs/pages/gfor.md
+++ b/docs/pages/gfor.md
@@ -74,14 +74,6 @@ gfor (seq k, 0, n-1) {
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
-array A = constant(1,n,n,m);
-array B = constant(1,n,n);
-gfor (seq k, 0,m-1) {
-   A(span,span,k) = A(span,span,k) * B; // matrix-matrix multiply
-}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
 array A = randu(n,m);
 array B = constant(0,n,m);
@@ -122,30 +114,6 @@ gfor (seq ii, n)
   H(span,ii) = compute(A(span,ii), B(span,ii), ep);
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Multiplications {#gfor_mul}
----------------
-
-ArrayFire supports bulk multiplications of vector-vector, matrix-vector, and
-matrix-matrix types using GFOR. This is especially useful with many small
-matrices.
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
-array A = constant(1,n,n);
-array B = constant(1,n,1);
-array C = constant(0,n,m);
-gfor (seq k, n)
-  B(k) = A(k,span) * A(span,k); // vector-vector multiply
-
-A = constant(1,n,n,m);
-gfor (seq k, m)
-  C(span,k) = A(span,span,k) * B;  // matrix-vector multiply
-
-A = constant(1,n,n,m);
-B = constant(1,n,n);
-gfor (seq k, m)
-  A(span,span,k) = A(span,span,k) * B;  // matrix-matrix multiply
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 The Iterator {#gfor_iterator}
 ------------
 
diff --git a/docs/pages/matrix_manipulation.md b/docs/pages/matrix_manipulation.md
index 8fd7b35355..35b2b9a61f 100644
--- a/docs/pages/matrix_manipulation.md
+++ b/docs/pages/matrix_manipulation.md
@@ -2,30 +2,272 @@ Matrix Manipulation {#matrixmanipulation}
 ===================
 
 Many different kinds of [matrix manipulation routines](\ref manip_mat) are available:
-* tile() to repeat a matrix along dimensions
-* join() to concatenate two matrices along a dimension
+* flat() - flatten an array to one dimension
+* flip() - flip an array along a dimension
+* join() - join up to 4 arrays
+* moddims() - change the dimensions of an array without changing the data
+* reorder() - changes the dimension order within the array
+* shift() - shifts data along a dimension
+* tile() - repeats an array along a dimension
+* transpose() - performs a matrix transpose
 * [array()](\ref af::array) to adjust the dimensions of an array
-* [transpose](\ref af::array::T) a matrix or vector
+* [transpose](\ref af::array::T) a matrix or vector with shorthand notation
 
-tile() allows you to repeat a matrix along specified
-dimensions, effectively 'tiling' the matrix.  Please note that the
-dimensions passed in indicate the number of times to replicate the
-matrix in each dimension, not the final dimensions of the matrix.
+### flat()
+The __flat()__ function flattens an array to one dimension.
+```
+a [3 3 1 1]
+    1.0000     4.0000     7.0000
+    2.0000     5.0000     8.0000
+    3.0000     6.0000     9.0000
 
-\snippet test/matrix_manipulation.cpp ex_matrix_manipulation_tile
+flat(a) [9 1 1 1]
+    1.0000
+    2.0000
+    3.0000
+    4.0000
+    5.0000
+    6.0000
+    7.0000
+    8.0000
+    9.0000
 
-join() allows you to joining two matrices together.  Matrix
-dimensions must match along every dimension except the dimension
-of joining (dimensions are 0-indexed). For example, a 2x3 matrix
-can be joined with a 2x4 matrix along dimension 1, but not along
-dimension 0 since {3,4} don`t match up.
+```
+The flat function has the following overloads:
+* __array af::flat(const array& in)__ -- flatten an array
+* __af_err af_flat(af_array* out, const af_array in)__ -- C interface for flat() function
 
-\snippet test/matrix_manipulation.cpp ex_matrix_manipulation_join
 
-Construct a regular mesh grid from vectors `x` and `y`. For example, a
-mesh grid of the vectors {1,2,3,4} and {5,6} would result in two matrices:
+### flip()
+The __flip()__ function flips the contents of an array along a chosen dimension.
+```
+a [5 2 1 1]
+    1.0000     6.0000
+    2.0000     7.0000
+    3.0000     8.0000
+    4.0000     9.0000
+    5.0000    10.0000
 
-\snippet test/matrix_manipulation.cpp ex_matrix_manipulation_mesh
+flip(a, 0) [5 2 1 1]
+    5.0000    10.0000
+    4.0000     9.0000
+    3.0000     8.0000
+    2.0000     7.0000
+    1.0000     6.0000
+
+flip(a, 1) [5 2 1 1]
+    6.0000     1.0000
+    7.0000     2.0000
+    8.0000     3.0000
+    9.0000     4.0000
+   10.0000     5.0000
+```
+The flip function has the following overloads:
+* __array af::flip(const array &in, const unsigned dim)__ -- flips an array along a dimension 
+* __af_err af_flip(af_array *out, const af_array in, const unsigned dim)__ -- C interface for flip()
+
+### join()
+The __join()__ function can join up to 4 arrays together.
+```
+a [5 1 1 1]
+    1.0000
+    2.0000
+    3.0000
+    4.0000
+    5.0000
+
+join(0, a, a) [10 1 1 1]
+    1.0000
+    2.0000
+    3.0000
+    4.0000
+    5.0000
+    1.0000
+    2.0000
+    3.0000
+    4.0000
+    5.0000
+
+join(1, a, a) [5 2 1 1]
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+    4.0000     4.0000
+    5.0000     5.0000
+```
+The join function has several overloads:
+* __array af::join(const int dim, const array &first, const array &second)__ -- Joins 2 arrays along a dimension
+
+* __array af::join(const int dim, const array &first, const array &second, const array &third)__ -- Joins 3 arrays along a dimension.
+
+* __array af::join(const int dim, const array &first, const array &second, const array &third, const array &fourth)__ -- Joins 4 arrays along a dimension
+
+* __af_err af_join(af_array *out, const int dim, const af_array first, const af_array second)__ -- C interface function to join 2 arrays along a dimension
+
+* __af_err af_join_many(af_array *out, const int dim, const unsigned n_arrays, const af_array *inputs)__ -- C interface function to join up to 10 arrays along a dimension
+
+### moddims()
+The __moddims()__ function changes the dimensions of an array without changing its data or order. It is important to remember that the function only modifies the _metadata_ associated with the array and does not actually modify the content of the array.
+```
+a [8 1 1 1]
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+
+af::dim4 new_dims(2, 4);
+moddims(a, new_dims) [2 4 1 1]
+    1.0000     1.0000     1.0000     1.0000
+    2.0000     2.0000     2.0000     2.0000
+
+moddims(a, a.elements(), 1, 1, 1) [8 1 1 1]
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+    1.0000
+    2.0000
+```
+The moddims function has several overloads:
+* __array af::moddims(const array &in, const unsigned ndims, const dim_t *const dims)__ -- mods number of dimensions to match _ndims_ as specidied in the array _dims_
+* __array af::moddims(const array &in, const dim4 &dims)__ -- mods dimensions as specified by _dims_
+* __array af::moddims(const array &in, const dim_t d0, const dim_t d1=1, const dim_t d2=1, const dim_t d3=1)__ -- mods dimensions of an array
+* __af_err af_moddims(af_array *out, const af_array in, const unsigned ndims, const dim_t *const dims)__ -- C interface to mod dimensions of an array
+
+### reorder()
+The __reorder()__ function changes the order of the dimensions within the array. This actually alters the underlying data of the array.
+```
+a [2 2 3 1]
+    1.0000     3.0000
+    2.0000     4.0000
+
+    1.0000     3.0000
+    2.0000     4.0000
+
+    1.0000     3.0000
+    2.0000     4.0000
+
+
+reorder(a, 1, 0, 2) [2 2 3 1]  //equivalent to a transpose
+    1.0000     2.0000
+    3.0000     4.0000
+
+    1.0000     2.0000
+    3.0000     4.0000
+
+    1.0000     2.0000
+    3.0000     4.0000
+
+
+reorder(a, 2, 0, 1) [3 2 2 1]
+    1.0000     2.0000
+    1.0000     2.0000
+    1.0000     2.0000
+
+    3.0000     4.0000
+    3.0000     4.0000
+    3.0000     4.0000
+```
+The reorder function the following several overloads:
+* __array af::reorder(const array &in, const unsigned x, const unsigned y=1, const unsigned z=2, const unsigned w=3)__ -- Reorders dimensions of an array
+
+* __af_err af_reorder(af_array *out, const af_array in, const unsigned x, const unsigned y, const unsigned z, const unsigned w)__ -- C interface for reordering function
+
+### shift()
+The __shift()__ function shifts data in a circular buffer fashion along a chosen dimension.
+```
+a [3 5 1 1]
+    0.0000     0.0000     0.0000     0.0000     0.0000
+    3.0000     4.0000     5.0000     1.0000     2.0000
+    3.0000     4.0000     5.0000     1.0000     2.0000
+
+shift(a, 0, 2 ) [3 5 1 1]
+    0.0000     0.0000     0.0000     0.0000     0.0000
+    1.0000     2.0000     3.0000     4.0000     5.0000
+    1.0000     2.0000     3.0000     4.0000     5.0000
+
+shift(a, -1, 2 ) [3 5 1 1]
+    1.0000     2.0000     3.0000     4.0000     5.0000
+    1.0000     2.0000     3.0000     4.0000     5.0000
+    0.0000     0.0000     0.0000     0.0000     0.0000
+```
+The shift function has the following overloads:
+* __array af::shift(const array &in, const int x, const int y=0, const int z=0, const int w=0)__ -- Shifts array along specified dimensions
+
+* __af_err af_shift(af_array *out, const af_array in, const int x, const int y, const int z, const int w)__ -- C interface for shifting an array
+
+### tile()
+The __tile()__ function repeats an array along a dimension
+```
+a [3 1 1 1]
+    1.0000
+    2.0000
+    3.0000
+
+tile(a, 2) [6 1 1 1]
+    1.0000
+    2.0000
+    3.0000
+    1.0000
+    2.0000
+    3.0000
+
+tile(a, 2, 2) [6 2 1 1]
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+
+af::dim4 tile_dims(1, 2, 3);
+tile(a, tile_dims) [3 2 3 1]
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+
+    1.0000     1.0000
+    2.0000     2.0000
+    3.0000     3.0000
+
+```
+The tile function has several overloads:
+* __array af::tile(const array &in, const unsigned x, const unsigned y=1, const unsigned z=1, const unsigned w=1)__  --  Tiles array along specified dimensions
+* __array af::tile(const array &in, const dim4 &dims)__  --  Tile an array according to a dim4 object
+* __af_err af_tile(af_array *out, const af_array in, const unsigned x, const unsigned y, const unsigned z, const unsigned w)__  --  C interface for tiling an array
+
+### transpose()
+The __transpose()__ function performs a standard matrix transpose. The input array must have the dimensions of a 2D-matrix.
+```
+a [3 3 1 1]
+    1.0000     3.0000     3.0000
+    2.0000     1.0000     3.0000
+    2.0000     2.0000     1.0000
+
+transpose(a) [3 3 1 1]
+    1.0000     2.0000     2.0000
+    3.0000     1.0000     2.0000
+    3.0000     3.0000     1.0000
+
+```
+The transpose function has several overloads:
+* __array af::transpose(const array &in, const bool conjugate=false)__ -- Transposes a matrix.
+
+* __void af::transposeInPlace(array &in, const bool conjugate=false)__ -- Transposes a matrix in-place.
+
+* __af_err af_transpose(af_array *out, af_array in, const bool conjugate)__ -- C interface to transpose a matrix.
+
+* __af_err af_transpose_inplace(af_array in, const bool conjugate)__ -- C interface to transpose a matrix in-place.
 
 [array()](\ref af::array) can be used to create a (shallow) copy of a matrix
 with different dimensions.  The number of elements must remain the same as
@@ -37,3 +279,25 @@ The [T()](\ref af::array::T) and [H()](\ref af::array::H) methods can be
 used to form the [matrix or vector transpose](\ref af::array::T) .
 
 \snippet test/matrix_manipulation.cpp ex_matrix_manipulation_transpose
+
+### Combining re-ordering functions to enumerate grid coordinates
+By using a combination of the array restructuring functions, we can quickly code complex manipulation patterns with a few lines of code. For example, consider generating _(x,y)_ coordinates for a grid where each axis goes from *1 to n*. Instead of using several loops to populate our arrays we can just use a small combination of the above functions.
+```
+unsigned n=3;
+af::array xy = join(1
+                tile(seq(1, n), n)
+                flat( transpose(tile(seq(1, n), 1, n)) )
+                   );
+xy [9 2 1 1]
+    1.0000     1.0000
+    2.0000     1.0000
+    3.0000     1.0000
+    1.0000     2.0000
+    2.0000     2.0000
+    3.0000     2.0000
+    1.0000     3.0000
+    2.0000     3.0000
+    3.0000     3.0000
+```
+### Conclusion
+Functions provided by arrayfire offer ease and flexibility for efficiently manipulating the structure of arrays. The provided functions can be used as building blocks to generate, shift, or prepare data to any form imaginable!
diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md
index 6bc53622ca..f1b195b184 100644
--- a/docs/pages/release_notes.md
+++ b/docs/pages/release_notes.md
@@ -1,6 +1,112 @@
 Release Notes {#releasenotes}
 ==============
 
+v3.2.0
+=================
+
+Major Updates
+-------------
+
+* Added Unified backend
+    * Allows switching backends at runtime
+    * Read [Unified Backend](\ref unifiedbackend) for more.
+* Support for 16-bit integers (\ref s16 and \ref u16)
+    * All functions that support 32-bit interger types (\ref s32, \ref u32),
+      now also support 16-bit interger types
+
+Function Additions
+------------------
+* Unified Backend
+    * \ref setBackend() - Sets a backend as active
+    * \ref getBackendCount() - Gets the number of backends available for use
+    * \ref getAvailableBackends() - Returns information about available backends
+    * \ref getBackendId() - Gets the backend enum for an array
+
+* Vision
+    * \ref homography() - Homography estimation
+    * \ref gloh() - GLOH Descriptor for SIFT
+
+* Image Processing
+    * \ref loadImageNative() - Load an image as native data without modification
+    * \ref saveImageNative() - Save an image without modifying data or type
+
+* Graphics
+    * \ref af::Window::plot3() - 3-dimensional line plot
+    * \ref af::Window::surface() - 3-dimensional curve plot
+
+* Indexing
+    * \ref af_create_indexers()
+    * \ref af_set_array_indexer()
+    * \ref af_set_seq_indexer()
+    * \ref af_set_seq_param_indexer()
+    * \ref af_release_indexers()
+
+* CUDA Backend Specific
+    * \ref setNativeId() - Set the CUDA device with given native id as active
+        * ArrayFire uses a modified order for devices. The native id for a
+          device can be retreived using `nvidia-smi`
+
+* OpenCL Backend Specific
+    * \ref setDeviceId() - Set the OpenCL device using the `clDeviceId`
+
+Other Improvements
+------------------------
+* Added \ref c32 and \ref c64 support for \ref isNaN(), \ref isInf() and \ref iszero()
+* Added CPU information for `x86` and `x86_64` architectures in CPU backend's \ref info()
+* Batch support for \ref approx1() and \ref approx2()
+    * Now can be used with gfor as well
+* Added \ref s64 and \ref u64 support to:
+    * \ref sort() (along with sort index and sort by key)
+    * \ref setUnique(), \ref setUnion(), \ref setIntersect()
+    * \ref convolve() and \ref fftConvolve()
+    * \ref histogram() and \ref histEqual()
+    * \ref lookup()
+    * \ref mean()
+* Added \ref AF_MSG macro
+
+Build Improvements
+------------------
+* Submodules update is now automatically called if not cloned recursively
+* [Fixes for compilation](https://github.com/arrayfire/arrayfire/issues/766) on Visual Studio 2015
+* Option to use [fallback to CPU LAPACK](https://github.com/arrayfire/arrayfire/pull/1053)
+  for linear algebra functions in case of CUDA 6.5 or older versions.
+
+Bug Fixes
+--------------
+* Fixed [memory leak](https://github.com/arrayfire/arrayfire/pull/1096) in \ref susan()
+* Fixed [failing test](https://github.com/arrayfire/arrayfire/commit/144a2db)
+  in \ref lower() and \ref upper() for CUDA compute 53
+* Fixed [bug](https://github.com/arrayfire/arrayfire/issues/1092) in CUDA for indexing out of bounds
+* Fixed [dims check](https://github.com/arrayfire/arrayfire/commit/6975da8) in \ref iota()
+* Fixed [out-of-bounds access](https://github.com/arrayfire/arrayfire/commit/7fc3856) in \ref sift()
+* Fixed [memory allocation](https://github.com/arrayfire/arrayfire/commit/5e88e4a) in \ref fast() OpenCL
+* Fixed [memory leak](https://github.com/arrayfire/arrayfire/pull/994) in image I/O functions
+* \ref dog() now returns float-point type arrays
+
+Documentation Updates
+---------------------
+* Improved tutorials documentation
+    * More detailed Using on [Linux](\ref using_on_windows), [OSX](\ref using_on_windows),
+      [Windows](\ref using_on_windows) pages.
+* Added return type information for functions that return different type
+  arrays
+
+New Examples
+------------
+* Graphics
+    * [Plot3](\ref plot3.cpp)
+    * [Surface](\ref surface.cpp)
+* [Shallow Water Equation](\ref swe.cpp)
+* [Basic](\ref basic.cpp) as a Unified backend example
+
+Installers
+-----------
+* All installers now include the Unified backend and corresponding CMake files
+* Visual Studio projects include Unified in the Platform Configurations
+* Added installer for Jetson TX1
+* SIFT and GLOH do not ship with the installers as SIFT is protected by
+  patents that do not allow commercial distribution without licensing.
+
 v3.1.3
 ==============
 
diff --git a/docs/pages/unified_backend.md b/docs/pages/unified_backend.md
new file mode 100644
index 0000000000..96bf94d0a3
--- /dev/null
+++ b/docs/pages/unified_backend.md
@@ -0,0 +1,212 @@
+Unified Backend {#unifiedbackend}
+==========
+
+[TOC]
+
+# Introduction
+
+The Unified backend was introduced in ArrayFire with version 3.2.
+While this is not an independent backend, it allows the user to switch between
+the different ArrayFire backends (CPU, CUDA and OpenCL) at runtime.
+
+# Compiling with Unified
+
+The steps to compile with the unified backend are the same as compiling with
+any of the other backends.
+The only change being that the executable needs to be linked with the __af__
+library (`libaf.so` (Linux), `libaf.dylib` (OSX), `af.lib` (Windows)).
+
+Check the Using with [Linux](\ref using_on_linux), [OSX](\ref using_on_osx),
+[Windows](\ref using_on_windows) for more details.
+
+To use with CMake, use the __ArrayFire_Unified_LIBRARIES__ variable.
+
+# Using the Unified Backend
+
+The Unified backend will try to dynamically load the backend libraries. The
+priority of backends is __CUDA -> OpenCL -> CPU__
+
+The most important aspect to note here is that all the libraries the ArrayFire
+libs depend on need to be in the environment paths
+
+* `LD_LIBRARY_PATH` -> Linux, Unix, OSX
+* `DYLD_LIBRARY_PATH` -> OSX
+* `PATH` -> Windows
+
+If any of the libs are missing, then the library will fail to load and the
+backend will be marked as unavailable.
+
+Optionally, The ArrayFire libs may be present in `AF_PATH` or `AF_BUILD_PATH`
+environment variables if the path is not in the system paths. These are
+treated as fallback paths in case the files are not found in the system paths.
+However, all the other upstream libraries for ArrayFire libs must be present
+in the system path variables shown above.
+
+### Special Mention: CUDA NVVM
+For the CUDA backend, ensure that the CUDA NVVM libs/dlls are in the path.
+These can be easily missed since CUDA installation does not add the paths by default.
+
+On Linux and OSX, add `/usr/local/cuda/nvvm/(lib or lib64)` to LD_LIBRARY_PATH or
+DYLD_LIBRARY_PATH.
+
+On Windows, you can set up a post build event that copys the NVVM dlls to
+the executable directory by using the following commands:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.c}
+echo copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
+copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
+if errorlevel 1 (
+    echo "CUDA NVVM DLLs copy failed due to missing files."
+    exit /B 0
+)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This ensures that the NVVM DLLs are copied if present, but does not fail the
+build if the copy fails. This is how ArrayFire ships it's examples.
+
+The other option is to set `%%CUDA_PATH%/nvvm/bin` in the PATH environment
+variable.
+
+# Switching Backends
+
+The af_backend enum stores the possible backends.
+To select a backend, call the af::setBackend function as shown below.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.c}
+af::setBackend(AF_BACKEND_OPENCL);    // Sets CUDA as current backend
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To get the count of the number of backends available (the number of `libaf*`
+backend libraries loaded successfully), call the af::getBackendCount function.
+
+# Example
+
+This example is shortened form of [basic.cpp](\ref basic.cpp).
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.c}
+#include <arrayfire.h>
+
+void testBackend()
+{
+    af::info();
+    af_print(af::randu(5, 4));
+}
+
+int main()
+{
+    try {
+        printf("Trying CPU Backend\n");
+        af::setBackend(AF_BACKEND_CPU);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying CPU backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    try {
+        printf("Trying CUDA Backend\n");
+        af::setBackend(AF_BACKEND_CUDA);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying CUDA backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    try {
+        printf("Trying OpenCL Backend\n");
+        af::setBackend(AF_BACKEND_OPENCL);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying OpenCL backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    return 0;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This output would be:
+
+    Trying CPU Backend
+    ArrayFire v3.2.0 (CPU, 64-bit Linux, build fc7630f)
+    [0] Intel: Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz Max threads(8)
+    af::randu(5, 4)
+    [5 4 1 1]
+        0.0000     0.2190     0.3835     0.5297
+        0.1315     0.0470     0.5194     0.6711
+        0.7556     0.6789     0.8310     0.0077
+        0.4587     0.6793     0.0346     0.3834
+        0.5328     0.9347     0.0535     0.0668
+
+    Trying CUDA Backend
+    ArrayFire v3.2.0 (CUDA, 64-bit Linux, build fc7630f)
+    Platform: CUDA Toolkit 7.5, Driver: 355.11
+    [0] Quadro K5000, 4093 MB, CUDA Compute 3.0
+    af::randu(5, 4)
+    [5 4 1 1]
+        0.7402     0.4464     0.7762     0.2920
+        0.9210     0.6673     0.2948     0.3194
+        0.0390     0.1099     0.7140     0.8109
+        0.9690     0.4702     0.3585     0.1541
+        0.9251     0.5132     0.6814     0.4452
+
+    Trying OpenCL Backend
+    ArrayFire v3.2.0 (OpenCL, 64-bit Linux, build fc7630f)
+    [0] NVIDIA  : Quadro K5000
+    -1- INTEL   : Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz
+    af::randu(5, 4)
+    [5 4 1 1]
+        0.4107     0.0081     0.6600     0.1046
+        0.8224     0.3775     0.0764     0.8827
+        0.9518     0.3027     0.0901     0.1647
+        0.1794     0.6456     0.5933     0.8060
+        0.4198     0.5591     0.1098     0.5938
+
+# Dos and Don'ts
+
+It is very easy to run into exceptions if you are not careful with the
+switching of backends.
+
+### Don't: Do not use arrays between different backends
+
+ArrayFire checks the input arrays to functions for mismatches with the active
+backend. If an array created on one backend, but used when another backend is
+set to active, an exception with code 503 (`AF_ERR_ARR_BKND_MISMATCH`) is
+thrown.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.c}
+#include <arrayfire.h>
+
+int main()
+{
+    try {
+        af::setBackend(AF_BACKEND_CUDA);
+        af::array A = af::randu(5, 5);
+
+        af::setBackend(AF_BACKEND_OPENCL);
+        af::array B = af::constant(10, 5, 5);
+        af::array C = af::matmul(A, B);     // This will throw an exception
+
+    } catch (af::exception& e) {
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    return 0;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+### Do: Use a naming scheme to track arrays and backends
+
+We recommend that you use a technique to track the arrays on the backends. One
+suggested technique would be to use a suffix of `_cpu`, `_cuda`, `_opencl`
+with the array names. So an array created on the CUDA backend would be named
+`myarray_cuda`.
+
+If you have not used the af::setBackend function anywhere in your code, then
+you do not have to worry about this as all the arrays will be created on the
+same default backend.
+
+### Don't: Do not use custom kernels (CUDA/OpenCL) with the Unified backend
+
+This is another area that is a no go when using the Unified backend. It not
+recommended that you use custom kernels with unified backend. This is mainly
+becuase the Unified backend is meant to be ultra portable and should use only
+ArrayFire and native CPU code.
diff --git a/docs/pages/using_on_linux.md b/docs/pages/using_on_linux.md
index 1f8f95e8ca..493080f447 100644
--- a/docs/pages/using_on_linux.md
+++ b/docs/pages/using_on_linux.md
@@ -1,23 +1,33 @@
 Using ArrayFire on Linux {#using_on_linux}
 =====
 
-
+Once you have [installed](\ref installing) ArrayFire on your system, the next thing to do is
+set up your build system. On Linux, you can create ArrayFire projects using
+almost any editor, compiler, or build system. The only requirements are
+that you include the ArrayFire header directories and link with the ArrayFire
+library you intend to use.
+
+## The big picture
+
+On Linux, we suggest you install ArrayFire to the `/usr/local` directory
+so that all of the include files and libraries are part of your standard path.
+The installer will populate files in the following sub-directories:
+
+    include/arrayfire.h         - Primary ArrayFire include file
+    include/af/*.h              - Additional include files
+    lib/libaf*                  - CPU, CUDA, and OpenCL libraries (.a, .so)
+    lib/libforge*               - Visualization library
+    share/ArrayFire/cmake/*     - CMake config (find) scripts
+    share/ArrayFire/examples/*  - All ArrayFire examples
+
+Because ArrayFire follows standard installation practices, you can use basically
+any build system to create and compile projects that use ArrayFire.
 Among the many possible build systems on Linux we suggest using ArrayFire with
-either CMake or Makefiles with CMake being the preferred build system.
-
-## Pre-requisites
-
-Before you get started, make sure you have the necessary pre-requisites.
+either CMake or Makefiles with CMake being our preferred build system.
 
-- If you are using CUDA, please make sure you have [CUDA 7](https://developer.nvidia.com/cuda-downloads) installed on your system.
-     - [Contact us](support@arrayfire.com) for custom builds (eg. different toolkits)
+## Prerequisite software
 
-- If you are using OpenCL, please make sure you have one of the following SDKs.
-     - [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
-     - [Intel OpenCL SDK](https://software.intel.com/en-us/articles/download-the-latest-intel-amt-software-development-kit-sdk)
-     - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-
-You will also need the following dependencies to use ArrayFire.
+To build ArrayFire projects you will need a compiler
 
 #### Fedora, Centos and Redhat
 
@@ -28,107 +38,131 @@ yum install epel-release
 yum update
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Install the common dependencies
+Install build dependencies
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 yum install gcc gcc-c++ cmake make
-yum install freeimage
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Install glfw (not required for no-gl installers)
-
-Fedora:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-yum install glfw
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For Centos and Redhat, please follow [these instructions](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire)
-
 #### Debian and Ubuntu
 
 Install common dependencies
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 apt-get install build-essential cmake cmake-curses-gui
-apt-get install libfreeimage3
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Install glfw (not required for no-gl installers)
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-apt-get install libglfw3
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For Debian 7 and Ubuntu 14.04, please follow [these instructions](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire)
+## CMake
 
-**Special instructions for Tegra-K1**
+We recommend that the CMake build system be used to create ArrayFire projects.
+If you are writing a new ArrayFire project in C/C++ from scratch, we suggest
+you grab a copy of our
+[CMake Project Example](https://github.com/arrayfire/arrayfire-project-templates);
+however, it is useful to read the documentation below in case you need to add
+ArrayFire to an existing project.
 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-sudo apt-get install libatlas3gf-base libatlas-dev libfftw3-dev liblapacke-dev
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+As [discussed above](#big-picture), ArrayFire ships with a series of CMake
+scripts to make finding and using our library easy.
+The scripts will automatically find all versions of the ArrayFire library
+and pick the most powerful of the installed backends (typically CUDA).
 
-## CMake
+First create a file called `CMakeLists.txt` in your project directory:
 
-This is the suggested method of using ArrayFire on Linux.
-ArrayFire ships with support for CMake by default, including a series of
-`Find` scripts installed  in the `/usr/local/share/ArrayFire/cmake` (or similar)
-directory.
-These scripts will automatically find the CUDA, OpenCL, and CPU versions
-of ArrayFire and automatically choose the most powerful installed backend
-(typically CUDA).
+    cd your-project-directory
+    touch CMakeLists.txt
 
-To use ArrayFire, simply insert the `FIND_PACKAGE` command inside of your
-`CMakeLists.txt` file as follows:
+and populate it with the following code:
 
     FIND_PACKAGE(ArrayFire)
     INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS})
-    ...
 
-    ADD_EXECUTABLE(some_executable ...)
-    TARGET_LINK_LIBRARIES(some_executable ${ArrayFire_LIBRARIES} )
+    ... [gather source files, etc.]
 
-The find script will automatically define several variables including:
+    # If you intend to use OpenCL, you need to find it
+    FIND_PACKAGE(OpenCL)
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${OpenCL_LIBRARIES})
 
-    ArrayFire_INCLUDE_DIRS    - Location of ArrayFire's include directory.
-    ArrayFire_LIBRARIES       - Location of ArrayFire's libraries. This will default
-                                to a GPU backend if one
-    ArrayFire_FOUND           - True if ArrayFire has been located
+    # Or if you intend to use CUDA, you need it as well as NVVM:
+    FIND_PACKAGE(CUDA)
+    FIND_PACKAGE(NVVM) # this FIND script can be found in the ArrayFire CMake example repository
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${CUDA_LIBRARIES} ${NVVM_LIB})
 
-If you wish to use a specific backend, the find script also defines these variables:
+    ADD_EXECUTABLE(my_executable [list your source files here])
+    TARGET_LINK_LIBRARIES(my_executable ${ArrayFire_LIBRARIES} ${EXTRA_LIBS})
 
-    ArrayFire_CPU_FOUND        - True of the ArrayFire CPU library has been found.
-    ArrayFire_CPU_LIBRARIES    - Location of ArrayFire's CPU library, if found
-    ArrayFire_CUDA_FOUND       - True of the ArrayFire CUDA library has been found.
-    ArrayFire_CUDA_LIBRARIES   - Location of ArrayFire's CUDA library, if found
-    ArrayFire_OpenCL_FOUND     - True of the ArrayFire OpenCL library has been found.
-    ArrayFire_OpenCL_LIBRARIES - Location of ArrayFire's OpenCL library, if found
+where `my_executable` is the name of the executable you wish to create.
+See the [CMake documentation](https://cmake.org/documentation/) for more
+information on how to use CMake.
+Clearly the above code snippet precludes the use of both CUDA and OpenCL, see
+the
+[ArrayFire CMake Example](https://github.com/arrayfire/arrayfire-project-templates/tree/master/CMake);
+for an example of how to build executables for both backends from the same
+CMake script.
 
-Therefore, if you wish to target a specific specific backend, switch
-`${ArrayFire_LIBRARIES}` to `${ArrayFire_CPU}` `${ArrayFire_OPENCL}` or
-`${ArrayFire_CUDA}` in the `TARGET_LINK_LIBRARIES` command above.
+In the above code listing, the `FIND_PACKAGE` will find the ArrayFire include
+files, libraries, and define several variables including:
 
-Finally, if you have installed ArrayFire to a non-standard location, CMake can still help
-you out. When you execute CMake specify the path to the `ArrayFireConfig*` files that
-are found in the `share/ArrayFire/cmake` subdirectory of the installation folder.
-For example, if ArrayFire were installed locally to `/opt/ArrayFire` then you would
-modify the `cmake` command above to contain the following definition:
+    ArrayFire_INCLUDE_DIRS    - Location of ArrayFire's include directory.
+    ArrayFire_LIBRARIES       - Location of ArrayFire's libraries.
+                                This will default to a GPU backend if one
+                                is found
+    ArrayFire_FOUND           - True if ArrayFire has been located
+
+If you wish to use a specific backend, the find script also defines these variables:
 
-```
-cmake -DArrayFire_DIR=/opt/ArrayFire/share/ArrayFire/cmake ...
-```
+    ArrayFire_CPU_FOUND         - True of the ArrayFire CPU library has been found.
+    ArrayFire_CPU_LIBRARIES     - Location of ArrayFire's CPU library, if found
+    ArrayFire_CUDA_FOUND        - True of the ArrayFire CUDA library has been found.
+    ArrayFire_CUDA_LIBRARIES    - Location of ArrayFire's CUDA library, if found
+    ArrayFire_OpenCL_FOUND      - True of the ArrayFire OpenCL library has been found.
+    ArrayFire_OpenCL_LIBRARIES  - Location of ArrayFire's OpenCL library, if found
+    ArrayFire_Unified_FOUND     - True of the ArrayFire Unified library has been found.
+    ArrayFire_Unified_LIBRARIES - Location of ArrayFire's Unified library, if found
+
+Therefore, if you wish to target a specific specific backend, simply replace
+`${ArrayFire_LIBRARIES}` with `${ArrayFire_CPU}`, `${ArrayFire_OPENCL}`,
+`${ArrayFire_CUDA}`, or `${ArrayFire_Unified}` in the `TARGET_LINK_LIBRARIES`
+command above.
+If you intend on building your software to link with all of these backends,
+please see the
+[CMake Project Example](https://github.com/arrayfire/arrayfire-project-templates)
+which makes use of some fairly fun CMake tricks to avoid re-compiling code
+whenever possible.
+
+Next we need to instruct CMake to create build instructions and then compile.
+We suggest using CMake's out-of-source build functionality to keep your build
+and source files cleanly separated. To do this:
+
+    cd your-project-directory
+    mkdir build
+    cd build
+    cmake ..
+    make
+
+*NOTE:* If you have installed ArrayFire to a non-standard location, CMake can
+still help you out. When you execute CMake specify the path to the
+`ArrayFireConfig*` files that are found in the `share/ArrayFire/cmake`
+subdirectory of the installation folder.
+For example, if ArrayFire were installed locally to `/opt/ArrayFire` then you
+would modify the `cmake` command above to contain the following definition:
+
+    cmake -DArrayFire_DIR=/opt/ArrayFire/share/ArrayFire/cmake ..
+
+You can also specify this information in the ccmake command-line interface.
 
 ## MakeFiles
 
-Using ArrayFire with Makefiles is almost as easy as CMake, but you will
-need to specify paths manually. In your makefile specify the include path to
-the directory containing `arrayfire.h`. Typically this will be `-I /usr/include`
-or `-I /usr/local/include` if you installed ArrayFire using our installation
+Building ArrayFire projects with Makefiles is fairly similar to CMake except
+you must specify all paths and libraries manually.
+As with any make project, you need to specify the include path to the
+directory containing `arrayfire.h` file.
+This should be `-I /usr/local/include` if you followed our installation
 instructions.
-Then, in your linker line specify the path to ArrayFire using the `-L` option
-(typically `-L/usr/lib` or `-L/usr/local/lib` and the specific ArrayFire backend
-you wish to use with the `-l` option (i.e. `-lafcpu`, `-lafopencl` or `-lafcuda`
-for the CPU, OpenCL and CUDA backends repsectively).
+Similarly, you will need to specify the path to the ArrayFire library using
+the `-L` option (e.g. `-L/usr/local/lib`) followed by the specific ArrayFire
+library you wish to use using the `-l` option (for example `-lafcpu`,
+`-lafopencl`, `-lafcuda`, or `-laf` for the CPU, OpenCL, CUDA, and unified
+backends respectively.
 
 Here is a minimial example MakeFile which uses ArrayFire's CPU backend:
 
diff --git a/docs/pages/using_on_osx.md b/docs/pages/using_on_osx.md
index 0baa8c94e1..ccb0fb523a 100644
--- a/docs/pages/using_on_osx.md
+++ b/docs/pages/using_on_osx.md
@@ -1,82 +1,211 @@
 Using ArrayFire on OSX {#using_on_osx}
 =====
 
+Once you have [installed](\ref installing) ArrayFire on your system, the next
+thing to do is set up your build system.
+On OSX, you may create ArrayFire project using almost any editor, compiler,
+or build system.
+The only requirement is that you can include the ArrayFire header directory,
+and link with the ArrayFire library you intend to use.
 
-Among the many possible build systems on OSX we suggest using ArrayFire with
-either CMake or Makefiles.
+## The big picture
 
-## Pre-requisites
+By default, the ArrayFire OSX installer will place several files in your
+computer's `/usr/local` directory.
+The installer will populate this directory with files in the following
+sub-directories:
 
-Before you get started, make sure you have the necessary pre-requisites.
+    include/arrayfire.h         - Primary ArrayFire include file
+    include/af/*.h              - Additional include files
+    lib/libaf*                  - CPU, CUDA, and OpenCL libraries (.a, .so)
+    lib/libforge*               - Visualization library
+    share/ArrayFire/cmake/*     - CMake config (find) scripts
+    share/ArrayFire/examples/*  - All ArrayFire examples
 
-- If you want to use ArrayFire with CUDA, please make sure you have [CUDA 7](https://developer.nvidia.com/cuda-downloads) installed on your system.
-     - [Contact us](support@arrayfire.com) for custom builds (eg. different toolkits)
+Because ArrayFire follows standard installation practices, you can use basically
+any build system to create and compile projects that use ArrayFire.
+Among the many possible build systems on Linux we suggest using ArrayFire with
+either CMake or Makefiles with CMake being our preferred build system.
 
-- Install the latest Xcode from the App Store
+## XCode
 
-- Install [brew](http://brew.sh/)
+Although we recommend using CMake to build ArrayFire projects on OSX, you can
+use XCode if this is your preferred development platform.
+To save some time, we have created an sample XCode project in our 
+[ArrayFire Project Templates repository](https://github.com/arrayfire/arrayfire-project-templates).
+
+To set up a basic C/C++ project in XCode do the following:
+
+1. Start up XCode. Choose OSX -> Application, Command Line Tool for the project:
+<img src="xcode-setup/xcode-startup.png" alt="Create a command line too XCode Project" width="100%" />
+
+2. Fill in the details for your project and choose either C or C++ for the project:
+<img src="xcode-setup/project-options.png" alt="Create a C/C++ project" width="100%" />
+
+3. Next we need to configure the build settings. In the left-hand pane, click
+   on the project. In the center pane, click on "Build Settings" followed by
+   the "All" button:
+<img src="xcode-setup/build-settings.png" alt="Configure build settings" width="100%" />
+
+4. Now search for "Header Search Paths" and add `/usr/local/include` to the list:
+<img src="xcode-setup/header-search-paths.png" alt="Configure build settings" width="100%" />
+
+5. Then search for "Library Search Paths" and add `/usr/local/lib` to the list:
+<img src="xcode-setup/library-search-paths.png" alt="Configure build settings" width="100%" />
+
+6. Next, we need to make sure the executable is linked with an ArrayFire library:
+   To do this, click the "Build Phases" tab and expand the "Link with Binary Library"
+   menu:
+<img src="xcode-setup/build-phases.png" alt="Configure build settings" width="100%" />
+
+7. In the search dialog that pops up, choose the "Add Other" button from the
+   lower right. Specify the `/usr/local/lib` folder:
+<img src="xcode-setup/library-folder-path.png" alt="Configure build settings" width="100%" />
+
+8. Lastly, select the ArrayFire library with which you wish to link your program.
+  Your options will be:
+
+~~~~~
+libafcuda.*.dylib   - CUDA backend
+libafopencl.*.dylib - OpenCL backend
+libafcpu.*.dylib    - CPU backend
+libaf.*.dylib       - Unified backend
+~~~~~
+
+In the picture below, we have elected to link with the OpenCL backend:
+
+<img src="xcode-setup/pick-arrayfire-library.png" alt="Configure build settings" width="100%" />
+
+9. Lastly, lets test ArrayFire's functionality. In the left hand pane open
+   the main.cpp` file and insert the following code:
+
+~~~~~
+// Include the ArrayFire header file
+#include <arrayfire.h>
+
+int main(int argc, const char * argv[]) {
+    // Gather some information about the ArrayFire device
+    af::info();
+    return 0;
+}
+~~~~~
+
+Finally, click the build button and you should see some information about your
+graphics card in the lower-section of your screen:
+
+<img src="xcode-setup/afinfo-result.png" alt="Configure build settings" width="100%" />
 
 ## CMake
 
-This is the suggested method of using ArrayFire on OSX.
-ArrayFire ships with support for CMake by default, including a series of
-`Find` scripts installed  in the `/usr/local/share/ArrayFire/cmake` (or similar)
-directory.
-These scripts will automatically find the CUDA, OpenCL, and CPU versions
-of ArrayFire and automatically choose the most powerful installed backend
-(typically CUDA).
+We recommend that the CMake build system be used to create ArrayFire projects.
+If you are writing a new ArrayFire project in C/C++ from scratch, we suggest
+you grab a copy of our
+[CMake Project Example](https://github.com/arrayfire/arrayfire-project-templates);
+however, it is useful to read the documentation below in case you need to add
+ArrayFire to an existing project.
+
+As [discussed above](#big-picture), ArrayFire ships with a series of CMake
+scripts to make finding and using our library easy.
+The scripts will automatically find all versions of the ArrayFire library
+and pick the most powerful of the installed backends (typically CUDA).
 
-To use ArrayFire, simply insert the `FIND_PACKAGE` command inside of your
-`CMakeLists.txt` file as follows:
+First create a file called `CMakeLists.txt` in your project directory:
+
+    cd your-project-directory
+    touch CMakeLists.txt
+
+and populate it with the following code:
 
     FIND_PACKAGE(ArrayFire)
     INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS})
-    ...
 
-    ADD_EXECUTABLE(some_executable ...)
-    TARGET_LINK_LIBRARIES(some_executable ${ArrayFire_LIBRARIES} )
+    ... [gather source files, etc.]
 
-The find script will automatically define several variables including:
+    # If you intend to use OpenCL, you need to find it
+    FIND_PACKAGE(OpenCL)
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${OpenCL_LIBRARIES})
 
-    ArrayFire_INCLUDE_DIRS    - Location of ArrayFire's include directory.
-    ArrayFire_LIBRARIES       - Location of ArrayFire's libraries. This will default
-                                to a GPU backend if one
-    ArrayFire_FOUND           - True if ArrayFire has been located
+    # Or if you intend to use CUDA, you need it as well as NVVM:
+    FIND_PACKAGE(CUDA)
+    FIND_PACKAGE(NVVM) # this FIND script can be found in the ArrayFire CMake example repository
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${CUDA_LIBRARIES} ${NVVM_LIB})
 
-If you wish to use a specific backend, the find script also defines these variables:
+    ADD_EXECUTABLE(my_executable [list your source files here])
+    TARGET_LINK_LIBRARIES(my_executable ${ArrayFire_LIBRARIES} ${EXTRA_LIBS})
+
+where `my_executable` is the name of the executable you wish to create.
+See the [CMake documentation](https://cmake.org/documentation/) for more
+information on how to use CMake.
+Clearly the above code snippet precludes the use of both CUDA and OpenCL, see
+the
+[ArrayFire CMake Example](https://github.com/bkloppenborg/arrayfire-cmake-example)
+for an example of how to build executables for both backends from the same
+CMake script.
 
-    ArrayFire_CPU_FOUND        - True of the ArrayFire CPU library has been found.
-    ArrayFire_CPU_LIBRARIES    - Location of ArrayFire's CPU library, if found
-    ArrayFire_CUDA_FOUND       - True of the ArrayFire CUDA library has been found.
-    ArrayFire_CUDA_LIBRARIES   - Location of ArrayFire's CUDA library, if found
-    ArrayFire_OpenCL_FOUND     - True of the ArrayFire OpenCL library has been found.
-    ArrayFire_OpenCL_LIBRARIES - Location of ArrayFire's OpenCL library, if found
+In the above code listing, the `FIND_PACKAGE` will find the ArrayFire include
+files, libraries, and define several variables including:
 
-Therefore, if you wish to target a specific specific backend, switch
-`${ArrayFire_LIBRARIES}` to `${ArrayFire_CPU}` `${ArrayFire_OPENCL}` or
-`${ArrayFire_CUDA}` in the `TARGET_LINK_LIBRARIES` command above.
+    ArrayFire_INCLUDE_DIRS    - Location of ArrayFire's include directory.
+    ArrayFire_LIBRARIES       - Location of ArrayFire's libraries.
+                                This will default to a GPU backend if one
+                                is found
+    ArrayFire_FOUND           - True if ArrayFire has been located
 
-Finally, if you have installed ArrayFire to a non-standard location, CMake can still help
-you out. When you execute CMake specify the path to the `ArrayFireConfig*` files that
-are found in the `share/ArrayFire/cmake` subdirectory of the installation folder.
-For example, if ArrayFire were installed locally to `/opt/ArrayFire` then you would
-modify the `cmake` command above to contain the following definition:
+If you wish to use a specific backend, the find script also defines these variables:
 
-```
-cmake -DArrayFire_DIR=/opt/ArrayFire/share/ArrayFire/cmake ...
-```
+    ArrayFire_CPU_FOUND         - True of the ArrayFire CPU library has been found.
+    ArrayFire_CPU_LIBRARIES     - Location of ArrayFire's CPU library, if found
+    ArrayFire_CUDA_FOUND        - True of the ArrayFire CUDA library has been found.
+    ArrayFire_CUDA_LIBRARIES    - Location of ArrayFire's CUDA library, if found
+    ArrayFire_OpenCL_FOUND      - True of the ArrayFire OpenCL library has been found.
+    ArrayFire_OpenCL_LIBRARIES  - Location of ArrayFire's OpenCL library, if found
+    ArrayFire_Unified_FOUND     - True of the ArrayFire Unified library has been found.
+    ArrayFire_Unified_LIBRARIES - Location of ArrayFire's Unified library, if found
+
+Therefore, if you wish to target a specific specific backend, simply replace
+`${ArrayFire_LIBRARIES}` with `${ArrayFire_CPU}`, `${ArrayFire_OPENCL}`,
+`${ArrayFire_CUDA}`, or `${ArrayFire_Unified}` in the `TARGET_LINK_LIBRARIES`
+command above.
+If you intend on building your software to link with all of these backends,
+please see the
+[CMake Project Example](https://github.com/arrayfire/arrayfire-project-templates)
+which makes use of some fairly fun CMake tricks to avoid re-compiling code
+whenever possible.
+
+Next we need to instruct CMake to create build instructions and then compile.
+We suggest using CMake's out-of-source build functionality to keep your build
+and source files cleanly separated. To do this:
+
+    cd your-project-directory
+    mkdir build
+    cd build
+    cmake ..
+    make
+
+*NOTE:* If you have installed ArrayFire to a non-standard location, CMake can
+still help you out. When you execute CMake specify the path to the
+`ArrayFireConfig*` files that are found in the `share/ArrayFire/cmake`
+subdirectory of the installation folder.
+For example, if ArrayFire were installed locally to `/opt/ArrayFire` then you
+would modify the `cmake` command above to contain the following definition:
+
+    cmake -DArrayFire_DIR=/opt/ArrayFire/share/ArrayFire/cmake ..
+
+You can also specify this information in the ccmake command-line interface.
 
 ## MakeFiles
 
-Using ArrayFire with Makefiles is almost as easy as CMake, but you will
-need to specify paths manually. In your makefile specify the include path to
-the directory containing `arrayfire.h`. Typically this will be `-I /usr/include`
-or `-I /usr/local/include` if you installed ArrayFire using our installation
+Building ArrayFire projects with Makefiles is fairly similar to CMake except
+you must specify all paths and libraries manually.
+As with any make project, you need to specify the include path to the
+directory containing `arrayfire.h` file.
+This should be `-I /usr/local/include` if you followed our installation
 instructions.
-Then, in your linker line specify the path to ArrayFire using the `-L` option
-(typically `-L/usr/lib` or `-L/usr/local/lib` and the specific ArrayFire backend
-you wish to use with the `-l` option (i.e. `-lafcpu`, `-lafopencl` or `-lafcuda`
-for the CPU, OpenCL and CUDA backends repsectively).
+Similarly, you will need to specify the path to the ArrayFire library using
+the `-L` option (e.g. `-L/usr/local/lib`) followed by the specific ArrayFire
+library you wish to use using the `-l` option (for example `-lafcpu`,
+`-lafopencl`, `-lafcuda`, or `-laf` for the CPU, OpenCL, CUDA, and unified
+backends respectively.
 
 Here is a minimial example MakeFile which uses ArrayFire's CPU backend:
 
diff --git a/docs/pages/using_on_windows.md b/docs/pages/using_on_windows.md
index aa4aeff2d0..92c7c2db92 100644
--- a/docs/pages/using_on_windows.md
+++ b/docs/pages/using_on_windows.md
@@ -1,108 +1,224 @@
 Using ArrayFire with Microsoft Windows and Visual Studio {#using_on_windows}
 =====
 
-## Pre-requisites
-
-Before you get started, make sure you have the necessary pre-requisites.
-
-- If you are using CUDA, please make sure you have [CUDA 7](https://developer.nvidia.com/cuda-downloads) installed on your system.
-     - [Contact us](support@arrayfire.com) for custom builds (eg. different toolkits)
-
-- If you are using OpenCL, please make sure you have one of the following SDKs.
-     - [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
-     - [Intel OpenCL SDK](https://software.intel.com/en-us/articles/download-the-latest-intel-amt-software-development-kit-sdk)
-     - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-
-## Step 0: Running pre-built executables
+If you have not already done so, please make sure you have installed,
+configured, and tested ArrayFire following the
+[installation instructions](\ref installing).
+
+## The big picture
+The ArrayFire Windows installer creates the following:
+1. `AF_PATH` environment variable to point to the installation location. The
+   default install location is `C:\Program Files\ArrayFire\v3`
+2. `AF_PATH/include`         : Header files for ArrayFire (include directory)
+3. `AF_PATH/lib`             : All ArrayFire backends libraries, dlls and dependency dlls (library directory)
+4. `AF_PATH/examples`        : Examples to get started. Some examples also have pre-built exectuables
+5. `AF_PATH/cmake`           : CMake config files for automatic configuration by external projects
+6. `AF_PATH/uninstall.exe`   : Uninstaller
+7. `AF_PATH/*`               : Other miscellenous files including licenses, logos, copyrights
+
+The installer also appends `%%AF_PATH%/lib` to the User PATH variable.
+
+To add `%%AF_PATH%/lib` to PATH for all users see the windows section in
+[installation instructions](\ref installing).
+
+### <a name="nvvm_dlls" />Dealing with CUDA NMMV DLLs
+When using CUDA with ArrayFire you may encounter a linker error indicating the
+NVVM DLLs are missing. This is because the NVVM DLLs are not part of the
+standard `CUDA_PATH\bin` installation directory that is added to your `PATH`
+when the CUDA installer runs. Thus, NVVM will not be found during runtime. There
+are a few ways to deal with this issue:
+
+1. Copy the DLLs to the exectuable location. This is, by far, the cleanest
+   solution and we recommend doing this with ArrayFire projects. To do so,
+   create a post-build event to copy the NVVM DLL as discusses below in
+   [Step 3 - Part A](#s3partA).
+2. Copy `CUDA_PATH\nvvm\bin\nvvm64_30_0.dll` to `CUDA_PATH\bin`. This is a one time
+   copy such that the NVVM DLL is now with all the other CUDA dlls and in a
+   directory that is a part of PATH and hence the DLL can be detected automatically.
+3. Add `%%CUDA_PATH%\nvvm\bin` to the system PATH environment variable.
+   This will allow automatic detection by the system and No further copying will
+   be required. ArrayFire does not add this to PATH since the CUDA installer
+   doesn't add it to PATH.
+
+## <a name="step1" />Step 1: Running pre-built executables
 
 The ArrayFire installer ships with a few pre-built executables with the examples.
-These should run out of the box.
-
-Note: For the CUDA executables, you will need to copy CUDA_PATH\nvvm\bin\nvvm64_30_0.dll
-to the location of the executables.
-
-## Step 1: Adding ArrayFire to PATH for all users
-
-The ArrayFire installer for Windows creates a user `PATH` variable containing
-`%%AF_PATH%/lib`. This is required so that Windows knows where to find the
-ArrayFire DLLs. This variable fixes the DLL finding only for the user that
-installs ArrayFire.
-
-To allow DLL detection for all users, it needs to be added to the system
-`PATH` variable. For this, follow the steps:
-
-1. Open Advanced System Settings:
-  * Windows 8: Move the Mouse pointer to the bottom right corner of the screen,
-    Right click, choose System. Then click "Advanced System Settings"
-  * Windows 7: Open the Start Menu and Right Click on "Computer". Then choose
-    Properties and click "Advanced System Settings"
-
-2. In _Advanced System Settings_ window, click on _Advanced_ tab
+These should run out of the box when double clicked.
 
-3. Click on _Environment Variables_, then under **System Variables**, find
-   `PATH`, and click on it.
+Some prebuilt examples are:
+* Helloworld (examples/helloworld)
+* BLAS (examples/benchmarks)
+* FFT (examples/benchmarks)
+* Pi Estimation (examples/benchmarks)
+* Conway (Graphics) (examples/graphics)
 
-4. In edit mode, append `%%AF_PATH%/lib`. NOTE: Ensure that there is a semi-colon
-   separating `%%AF_PATH%/lib` from any existing content (e.g.
-   `EXISTING_PATHS;%%AF_PATH%/lib;`) otherwise other software may not function
-   correctly.
+Note: For the CUDA executables, you will need to copy `CUDA_PATH\nvvm\bin\nvvm64_30_0.dll`
+to the location of the executables.
 
-## Step 2: Verify the path addition functions correctly
+## <a name="step2" />Step 2: Build and Run a Project
 
-1. Open Visual Studio 2013. Open the HelloWorld solution which is located at
+1. Open Visual Studio 2013. Load the HelloWorld solution which is located at
    `AF_PATH/examples/helloworld/helloworld.sln`.
-2. Build and run the `helloworld` example. Be sure to, select the
-   platform/configuration of your choice using the platform drop-down
-   (the options are CPU, CUDA, and OpenCL) and Solution Configuration drop down
-   (options of Release and Debug) menus.
-3. Run the `helloworld` example
-
-## Step 3: Creating your own Visual Studio Project
-
-### A new project from scratch
-
-If you are creating a new project which is intended to be platform-independent,
-the best option is to simply copy the existing `helloworld` solution files
-and modify them to suit your needs. This will retain all the platform based
-settings that have been configured in the examples.
-
-### Adding ArrayFire CPU/OpenCL to a new/existing project
-
-If you are adding ArrayFire to a new or existing project that will contain
-custom CPU or OpenCL kernels, you only need to make a few modifications to
-your project soultion:
+2. Build the `helloworld` example. Be sure to, select the platform/configuration
+   of your choice using the platform drop-down (the options are CPU, CUDA,
+   OpenCL, and Unified) and Solution Configuration drop down (options of Release
+   and Debug) menus.
+3. Run the `helloworld` example.
+
+## <a name="step3" />Step 3: Using ArrayFire within Visual Studio
+This is divided into 4 parts:
+* [Part A: Adding ArrayFire to an existing solution (Single Backend)](#s3partA)
+* [Part B: Adding ArrayFire CUDA to a new/existing CUDA project](#s3partB)
+* [Part C: Project with all ArrayFire backends](#s3partC)
+* [Part D: ArrayFire with CMake](#s3partD)
+
+### <a name="s3partA" />Part A: Adding ArrayFire to an existing solution (Single Backend)
+Note: If you plan on using Native CUDA code in the project, use the steps
+under [Part B](#s3partB).
+
+Adding a single backend to an existing project is quite simple.
+
+1. Add `"$(AF_PATH)/include;"` to
+   _Project Properties -> C/C++ -> General -> Additional Include Directories_.
+2. Add `"$(AF_PATH)/lib;"` to
+   _Project Properties -> Linker -> General -> Additional Library Directories_.
+3. Add `afcpu.lib` or `afcuda.lib` or `afopencl.lib` to
+   _Project Properties -> Linker -> Input -> Additional Dependencies_.
+   based on your preferred backend.
+4. (Optional) You may choose to define `NOMINMAX`, `AF_<CPU/CUDA/OPENCL>`
+   and/or `AF_<DEBUG/RELEASE>` in your projects. This can be added to
+   _Project Properties -> C/C++ -> General -> Preprocessor-> Preprocessory definitions_.
+
+If you are using the CUDA backend, it is important to ensure that the CUDA NVVM
+DLLs are copied to the exectuable directory. This can be done by adding a post
+build event.
+
+Open the _Project Properties -> Build Events -> Post Build Events_ dialog and
+add the following lines to it.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.c}
+echo copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
+copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+### <a name="s3partB" />Part B: Adding ArrayFire CUDA to a new/existing CUDA project
+Lastly, if your project contains custom CUDA code, the instructions are slightly
+different as it requires using a CUDA NVCC Project:
 
-1. Open an existing project or create a new "Empty C/C++ project in Visual Studio"
-2. Add `$(AF_PATH)/include;` to
-   _Project Properties -> C/C++ -> General -> Additional Include Directories_
-3. Add `$(AF_PATH)/lib;` to
-  _Project Properties -> Linker -> General -> Additional Library Directories_
+1. Create a custom "CUDA NVCC project" in Visual Studio
+2. Add `"$(AF_PATH)/include;"` to
+   _Project Properties -> CUDA C/C++ -> General -> Additional Include Directories_.
+3. Add `"$(AF_PATH)/lib;"` to
+   _Project Properties -> Linker -> General -> Additional Library Directories_.
 4. Add `afcpu.lib` or `afcuda.lib` or `afopencl.lib` to
-  _Project Properties -> Linker -> Input -> Additional Dependencies_
-  based on your preferred backend.
-5. (Optional) You make choose to define `NOMINMAX`, `AF_<CPU/CUDA/OPENCL>`
-  and/or `AF_<DEBUG/RELEASE>` in your projects. This can be added to
-  _Project Properties -> C/C++ -> General -> Preprocessor-> Preprocessory definitions_.
+   _Project Properties -> Linker -> Input -> Additional Dependencies_.
+   based on your preferred backend.
+5. (Optional) You may choose to define `NOMINMAX`, `AF_CUDA`
+   and/or `AF_<DEBUG/RELEASE>` in your projects. This can be added to
+   _Project Properties -> C/C++ -> General -> Preprocessor-> Preprocessory definitions_.
+6. Pick a solution to handle the NVVM DLLs. We recommend the post build event
+   method used in [Part A](#s3partA).
+
+### <a name="s3partC" />Part C: Project with all ArrayFire backends
+If you wish to create a project that allows you to use all the ArrayFire
+backends with ease, the best way to go is to copy the *HelloWorld sln/vcxproj/cpp*
+file trio and rename them to suit your project.
+
+All the ArrayFire examples are pre-configured for all ArrayFire backends as well
+as the Unified API. These can be chosen from the Solution/Platform configuration
+drop down boxes.
+
+You can alternately download the template project from
+[ArrayFire Template Projects](https://github.com/arrayfire/arrayfire-project-templates)
+
+### <a name="s3partD" />Part D: ArrayFire with CMake
+*NOTE:* The ArrayFire installer sets up CMake file and registry so that it can be found
+by CMake by simply using the `Find_PACKAGE(ArrayFire)` command.
+
+If you are writing a new ArrayFire project in C/C++ from scratch, we suggest
+you grab a copy of our
+[CMake Project Example](https://github.com/arrayfire/arrayfire-project-templates);
+however, it is useful to read the documentation below in case you need to add
+ArrayFire to an existing project.
+
+As [discussed above](#big-picture), ArrayFire ships with a series of CMake
+scripts to make finding and using our library easy.
+The scripts will automatically find all versions of the ArrayFire library
+and pick the most powerful of the installed backends (typically CUDA).
+
+First create a file called `CMakeLists.txt` in your project directory:
+
+    cd your-project-directory
+    touch CMakeLists.txt
+
+and populate it with the following code:
+
+    FIND_PACKAGE(ArrayFire)
+    INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS})
+
+    ... [gather source files, etc.]
+
+    # If you intend to use OpenCL, you need to find it
+    FIND_PACKAGE(OpenCL)
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${OpenCL_LIBRARIES})
+
+    # Or if you intend to use CUDA, you need it as well as NVVM:
+    FIND_PACKAGE(CUDA)
+    FIND_PACKAGE(NVVM) # this FIND script can be found in the ArrayFire CMake example repository
+    SET(EXTRA_LIBS ${CMAKE_THREAD_LIBS_INIT} ${CUDA_LIBRARIES} ${NVVM_LIB})
+
+    ADD_EXECUTABLE(my_executable [list your source files here])
+    TARGET_LINK_LIBRARIES(my_executable ${ArrayFire_LIBRARIES} ${EXTRA_LIBS})
+
+where `my_executable` is the name of the executable you wish to create.
+See the [CMake documentation](https://cmake.org/documentation/) for more
+information on how to use CMake.
+Clearly the above code snippet precludes the use of both CUDA and OpenCL, see
+the
+[ArrayFire CMake Example](https://github.com/arrayfire/arrayfire-project-templates/tree/master/CMake)
+for an example of how to build executables for both backends from the same
+CMake script.
+
+In the above code listing, the `FIND_PACKAGE` will find the ArrayFire include
+files, libraries, and define several variables including:
+
+    ArrayFire_INCLUDE_DIRS    - Location of ArrayFire's include directory.
+    ArrayFire_LIBRARIES       - Location of ArrayFire's libraries.
+                                This will default to a GPU backend if one
+                                is found
+    ArrayFire_FOUND           - True if ArrayFire has been located
+
+If you wish to use a specific backend, the find script also defines these variables:
+
+    ArrayFire_CPU_FOUND         - True of the ArrayFire CPU library has been found.
+    ArrayFire_CPU_LIBRARIES     - Location of ArrayFire's CPU library, if found
+    ArrayFire_CUDA_FOUND        - True of the ArrayFire CUDA library has been found.
+    ArrayFire_CUDA_LIBRARIES    - Location of ArrayFire's CUDA library, if found
+    ArrayFire_OpenCL_FOUND      - True of the ArrayFire OpenCL library has been found.
+    ArrayFire_OpenCL_LIBRARIES  - Location of ArrayFire's OpenCL library, if found
+    ArrayFire_Unified_FOUND     - True of the ArrayFire Unified library has been found.
+    ArrayFire_Unified_LIBRARIES - Location of ArrayFire's Unified library, if found
+
+Therefore, if you wish to target a specific specific backend, simply replace
+`${ArrayFire_LIBRARIES}` with `${ArrayFire_CPU}`, `${ArrayFire_OPENCL}`,
+`${ArrayFire_CUDA}`, or `${ArrayFire_Unified}` in the `TARGET_LINK_LIBRARIES`
+command above.
+
+Next we need to instruct CMake to create build instructions and then compile.
+We suggest using CMake's out-of-source build functionality to keep your build
+and source files cleanly separated. To do this open the CMake GUI.
+
+* Under source directory, add the path to your project
+* Under build directory, add the path to your project and append /build
+* Click configure and choose Visual Studio 2013 Win 64 as the generator.
+* If configuration was successful, click generate. This will create a
+  my-project.sln file under build. You can open this in Visual Studio and
+  compile the ALL_BUILD project.
+
+
+The [ArrayFire CMake Example](https://github.com/arrayfire/arrayfire-project-templates/tree/master/CMake)
+is a CMake project used to demo how ArrayFire can be using with a CMake project.
 
-### Adding ArrayFire CUDA to a new/existing project
+Note: The CMake project does not add the post build event to copy the NVVM DLLs
+in case of CUDA backend. You will need to either copy it manually to the exectuable
+directory, or pick another solution for it.
 
-Lastly, if your project contains custom CUDA code, the instructions are slightly
-different:
-
-1. Create a custom "CUDA NVCC project" in Visual Studio
-2. Follow steps 2-5 from the _Adding ArrayFire CPU/OpenCL to a new/existing project_
-   instructions above
-3. Add the following lines to the
-   _Project Properties -> Build Events -> Post Build Events_
-   dialog:
-
-     ```
-     echo copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
-     copy "$(CUDA_PATH)\nvvm\bin\nvvm64*.dll" "$(OutDir)"
-     ```
-
-4. Ensure that you use x64 based configurations.
-
-Please note that this method will not work with the ArrayFire examples as
-our implementations are built with the Visual Studio CL compiler rather than
-NVCC to ensure they are supported across various platforms.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0c66486080..a795916eb3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -14,12 +14,12 @@ if(TARGET afcpu OR TARGET afcuda OR TARGET afopencl)
     SET(ArrayFire_OpenCL_FOUND False)
     SET(ASSETS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../assets")
     IF(NOT EXISTS "${ASSETS_DIR}/LICENSE")
-        MESSAGE(WARNING "Arrayfire assets are not available. Assets will not be installed.")
-        MESSAGE("Did you miss the --recursive option when cloning?")
-        MESSAGE("Run the following commands to correct this:")
-        MESSAGE("git submodule init")
-        MESSAGE("git submodule update")
-        MESSAGE("git submodule foreach git pull origin master")
+        MESSAGE(STATUS "Assests submodule unavailable. Updating submodules.")
+        EXECUTE_PROCESS(
+            COMMAND git submodule update --init --recursive
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+            OUTPUT_QUIET
+        )
     ENDIF()
 else()
     FIND_PACKAGE(ArrayFire REQUIRED)
@@ -81,6 +81,21 @@ else()
   MESSAGE(STATUS "EXAMPLES: CPU backend is OFF. afcpu was not found.")
 endif()
 
+# Next we build each example using every backend.
+if(${ArrayFire_Unified_FOUND})  # variable defined by FIND(ArrayFire ...)
+  MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.")
+  BUILD_ALL("${FILES}" unified ${ArrayFire_Unified_LIBRARIES} "")
+elseif(TARGET af)        # variable defined by the ArrayFire build tree
+  MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.")
+  IF(WIN32)
+      BUILD_ALL("${FILES}" unified af "")
+  ELSE()
+      BUILD_ALL("${FILES}" unified af "dl")
+  ENDIF()
+else()
+  MESSAGE(STATUS "EXAMPLES: UNIFIED backend is OFF. af was not found.")
+endif()
+
 if (${CUDA_FOUND})
   if(${ArrayFire_CUDA_FOUND})  # variable defined by FIND(ArrayFire ...)
     FIND_LIBRARY( CUDA_NVVM_LIBRARY
diff --git a/examples/common/progress.h b/examples/common/progress.h
index debb511e1a..6452aa2a5b 100644
--- a/examples/common/progress.h
+++ b/examples/common/progress.h
@@ -36,7 +36,7 @@ static bool progress(unsigned iter_curr, af::timer t, double time_total)
 
     if (time_curr < time_total) return true;
 
-    printf(" ### vortex %f iterations per second (max)\n", max_rate);
+    printf(" ### %f iterations per second (max)\n", max_rate);
     return false;
 }
 
diff --git a/examples/graphics/plot3.cpp b/examples/graphics/plot3.cpp
new file mode 100644
index 0000000000..ea2ca8d53d
--- /dev/null
+++ b/examples/graphics/plot3.cpp
@@ -0,0 +1,58 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <arrayfire.h>
+#include <cstdio>
+#include <math.h>
+
+using namespace af;
+
+static const int ITERATIONS = 200;
+static const float PRECISION = 1.0f/ITERATIONS;
+
+int main(int argc, char *argv[])
+{
+    try {
+        // Initialize the kernel array just once
+        af::info();
+        af::Window myWindow(800, 800, "3D Line Plot example: ArrayFire");
+
+        static float t=0.1;
+        array Z = seq( 0.1f, 10.f, PRECISION);
+        array bounds = constant(1, Z.dims());
+
+        do{
+            array Y = sin((Z*t) + t) / Z;
+            array X = cos((Z*t) + t) / Z;
+            X = max(min(X, bounds),-bounds);
+            Y = max(min(Y, bounds),-bounds);
+
+            array Pts = join(1, X, Y, Z);
+            //Pts can be passed in as a matrix in the form n x 3, 3 x n
+            //or in the flattened xyz-triplet array with size 3n x 1
+            myWindow.plot3(Pts);
+
+            t+=0.01;
+        } while(!myWindow.close());
+
+    } catch (af::exception& e) {
+        fprintf(stderr, "%s\n", e.what());
+        throw;
+    }
+
+    #ifdef WIN32 // pause in Windows
+    if (!(argc == 2 && argv[1][0] == '-')) {
+        printf("hit [enter]...");
+        fflush(stdout);
+        getchar();
+    }
+    #endif
+    return 0;
+}
+
diff --git a/examples/graphics/surface.cpp b/examples/graphics/surface.cpp
new file mode 100644
index 0000000000..92d5185d16
--- /dev/null
+++ b/examples/graphics/surface.cpp
@@ -0,0 +1,55 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <arrayfire.h>
+#include <cstdio>
+#include <math.h>
+
+using namespace af;
+
+static const int ITERATIONS = 30;
+static const float PRECISION = 1.0f/ITERATIONS;
+
+int main(int argc, char *argv[])
+{
+    try {
+        // Initialize the kernel array just once
+        af::info();
+        af::Window myWindow(800, 800, "3D Surface example: ArrayFire");
+
+        array X = seq(-1, 1, PRECISION);
+        array Y = seq(-1, 1, PRECISION);
+        array Z = randn(X.dims(0), Y.dims(0));
+
+        static float t=0;
+        while(!myWindow.close()) {
+            t+=0.07;
+            //Z = sin(tile(X,1, Y.dims(0))*t + t) + cos(transpose(tile(Y, 1, X.dims(0)))*t + t);
+            array x = tile(X,1, Y.dims(0));
+            array y = transpose(tile(Y, 1, X.dims(0)));
+            Z = 10*x*-abs(y) * cos(x*x*(y+t))+sin(y*(x+t))-1.5;
+
+            myWindow.surface(X, Y, Z, NULL);
+        }
+
+    } catch (af::exception& e) {
+        fprintf(stderr, "%s\n", e.what());
+        throw;
+    }
+
+    #ifdef WIN32 // pause in Windows
+    if (!(argc == 2 && argv[1][0] == '-')) {
+        printf("hit [enter]...");
+        fflush(stdout);
+        getchar();
+    }
+    #endif
+    return 0;
+}
+
diff --git a/examples/pde/swe.cpp b/examples/pde/swe.cpp
new file mode 100644
index 0000000000..84ce1ff4de
--- /dev/null
+++ b/examples/pde/swe.cpp
@@ -0,0 +1,86 @@
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <arrayfire.h>
+#include "../common/progress.h"
+
+using namespace af;
+
+Window *win;
+
+array normalize(array a, float max)
+{
+    float mx = max * 0.5;
+    float mn = -max * 0.5;
+    return (a-mn)/(mx-mn);
+}
+
+static void swe(bool console)
+{
+    double time_total = 20; // run for N seconds
+    // Grid length, number and spacing
+    const unsigned Lx = 512, nx = Lx + 1;
+    const unsigned Ly = 512, ny = Ly + 1;
+    const float dx = Lx / (nx - 1);
+    const float dy = Ly / (ny - 1);
+
+    array ZERO = constant(0, nx, ny);
+    array um = ZERO, vm = ZERO;
+    unsigned io = (unsigned)floor(Lx  / 5.0f),
+             jo = (unsigned)floor(Ly / 5.0f),
+             k = 20;
+    array x = tile(moddims(seq(nx),nx,1), 1,ny);
+    array y = tile(moddims(seq(ny),1,ny), nx,1);
+
+    // Initial condition
+    array etam = 0.01f * exp((-((x - io) * (x - io) + (y - jo) * (y - jo))) / (k * k));
+    float m_eta = max<float>(etam);
+    array eta = etam;
+    float dt = 0.5;
+
+    // conv kernels
+    float h_diff_kernel[] = {9.81f * (dt / dx), 0, -9.81f * (dt / dx)};
+    float h_lap_kernel[] = {0, 1, 0, 1, -4, 1, 0, 1, 0};
+
+    array h_diff_kernel_arr(3, h_diff_kernel);
+    array h_lap_kernel_arr(3, 3, h_lap_kernel);
+
+    if(!console) {
+        win = new Window(512, 512,"Shallow Water Equations");
+        win->setColorMap(AF_COLORMAP_MOOD);
+    }
+
+    timer t = timer::start();
+    unsigned iter = 0;
+    while (progress(iter, t, time_total)) {
+        // compute
+        array up = um + convolve(eta, h_diff_kernel_arr);
+        array vp = um + convolve(eta, h_diff_kernel_arr.T());
+        array e = convolve(eta, h_lap_kernel_arr);
+        array etap = 2 * eta - etam + (2 * dt * dt) / (dx * dy) * e;
+
+        etam = eta;
+        eta = etap;
+        if (!console) {
+            win->image(normalize(eta, m_eta));
+            // viz
+        } else eval(eta, up, vp);
+        iter++;
+    }
+}
+int main(int argc, char* argv[])
+{
+    int device = argc > 1 ? atoi(argv[1]) : 0;
+    bool console = argc > 2 ? argv[2][0] == '-' : false;
+    try {
+        af::setDevice(device);
+        af::info();
+        printf("Simulation of shallow water equations\n");
+        swe(console);
+    } catch (af::exception& e) {
+        fprintf(stderr, "%s\n", e.what());
+        throw;
+    }
+    return 0;
+}
diff --git a/examples/unified/basic.cpp b/examples/unified/basic.cpp
new file mode 100644
index 0000000000..31d1eacfca
--- /dev/null
+++ b/examples/unified/basic.cpp
@@ -0,0 +1,78 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <arrayfire.h>
+#include <vector>
+#include <algorithm>
+
+using namespace af;
+
+std::vector<float> input(100);
+
+// Generate a random number between 0 and 1
+// return a uniform number in [0,1].
+double unifRand()
+{
+    return rand() / double(RAND_MAX);
+}
+
+void testBackend()
+{
+    af::info();
+
+    af::dim4 dims(10, 10, 1, 1);
+
+    af::array A(dims, &input.front());
+    af_print(A);
+
+    af::array B = af::constant(0.5, dims, f32);
+    af_print(B);
+}
+
+int main(int argc, char *argv[])
+{
+    std::generate(input.begin(), input.end(), unifRand);
+
+    try {
+        printf("Trying CPU Backend\n");
+        af::setBackend(AF_BACKEND_CPU);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying CPU backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    try {
+        printf("Trying CUDA Backend\n");
+        af::setBackend(AF_BACKEND_CUDA);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying CUDA backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    try {
+        printf("Trying OpenCL Backend\n");
+        af::setBackend(AF_BACKEND_OPENCL);
+        testBackend();
+    } catch (af::exception& e) {
+        printf("Caught exception when trying OpenCL backend\n");
+        fprintf(stderr, "%s\n", e.what());
+    }
+
+    #ifdef WIN32 // pause in Windows
+    if (!(argc == 2 && argv[1][0] == '-')) {
+        printf("hit [enter]...");
+        fflush(stdout);
+        getchar();
+    }
+    #endif
+
+    return 0;
+}
diff --git a/include/af/arith.h b/include/af/arith.h
index fc2cdc2a82..b5f6f17ba9 100644
--- a/include/af/arith.h
+++ b/include/af/arith.h
@@ -578,7 +578,7 @@ extern "C" {
     /**
        C Interface for dividing an array by another
 
-       \param[out] out will contain result of \p lhs / \p rhs
+       \param[out] out will contain result of \p lhs / \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -591,7 +591,7 @@ extern "C" {
     /**
        C Interface for checking if an array is less than another
 
-       \param[out] out will contain result of \p lhs < \p rhs
+       \param[out] out will contain result of \p lhs < \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -604,7 +604,7 @@ extern "C" {
     /**
        C Interface for checking if an array is greater than another
 
-       \param[out] out will contain result of \p lhs > \p rhs
+       \param[out] out will contain result of \p lhs > \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -617,7 +617,7 @@ extern "C" {
     /**
        C Interface for checking if an array is less or equal to another
 
-       \param[out] out will contain result of \p lhs <= \p rhs
+       \param[out] out will contain result of \p lhs <= \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -630,7 +630,7 @@ extern "C" {
     /**
        C Interface for checking if an array is greater or equal to another
 
-       \param[out] out will contain result of \p lhs >= \p rhs
+       \param[out] out will contain result of \p lhs >= \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -643,7 +643,7 @@ extern "C" {
     /**
        C Interface for checking if an array is equal to another
 
-       \param[out] out will contain result of \p lhs == \p rhs
+       \param[out] out will contain result of \p lhs == \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -656,7 +656,7 @@ extern "C" {
     /**
        C Interface for checking if an array is not equal to another
 
-       \param[out] out will contain result of \p lhs != \p rhs
+       \param[out] out will contain result of \p lhs != \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -669,7 +669,7 @@ extern "C" {
     /**
        C Interface for performing logical and on two arrays
 
-       \param[out] out will contain result of \p lhs && \p rhs
+       \param[out] out will contain result of \p lhs && \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -682,7 +682,7 @@ extern "C" {
     /**
        C Interface for performing logical or on two arrays
 
-       \param[out] out will contain result of \p lhs || \p rhs
+       \param[out] out will contain result of \p lhs || \p rhs. out is of type b8
        \param[in] lhs first input
        \param[in] rhs second input
        \param[in] batch specifies if operations need to be performed in batch mode
@@ -695,7 +695,7 @@ extern "C" {
     /**
        C Interface for performing logical not on input
 
-       \param[out] out will contain result of logical not of \p in
+       \param[out] out will contain result of logical not of \p in. out is of type b8
        \param[in] in is the input
        \return \ref AF_SUCCESS if the execution completes properly
 
diff --git a/include/af/array.h b/include/af/array.h
index bdc6502208..a5f39e7793 100644
--- a/include/af/array.h
+++ b/include/af/array.h
@@ -84,6 +84,19 @@ namespace af
             ASSIGN(/=)
 #undef ASSIGN
 
+#if AF_API_VERSION >= 32
+#define ASSIGN(OP)                                                  \
+            array_proxy& operator OP(const short &a);               \
+            array_proxy& operator OP(const unsigned short &a);      \
+
+            ASSIGN(=)
+            ASSIGN(+=)
+            ASSIGN(-=)
+            ASSIGN(*=)
+            ASSIGN(/=)
+#undef ASSIGN
+#endif
+
             // af::array member functions. same behavior as those below
             af_array get();
             af_array get() const;
@@ -627,7 +640,7 @@ namespace af
         bool isfloating() const;
 
         /**
-           \brief Returns true if the array type is \ref u8, \ref b8, \ref s32 \ref u32, \ref s64, \ref u64
+           \brief Returns true if the array type is \ref u8, \ref b8, \ref s32 \ref u32, \ref s64, \ref u64, \ref s16, \ref u16
         */
         bool isinteger() const;
 
@@ -813,7 +826,7 @@ namespace af
         /// \ingroup method_mat
         array H() const;
 
-#define ASSIGN(OP)                                                                      \
+#define ASSIGN_(OP)                                                                     \
         array& OP(const array &val);                                                    \
         array& OP(const double &val);              /**< \copydoc OP (const array &) */  \
         array& OP(const cdouble &val);             /**< \copydoc OP (const array &) */  \
@@ -829,6 +842,17 @@ namespace af
         array& OP(const long long  &val);          /**< \copydoc OP (const array &) */  \
         array& OP(const unsigned long long &val);  /**< \copydoc OP (const array &) */  \
 
+#if AF_API_VERSION >= 32
+#define ASSIGN(OP)                                                                      \
+        ASSIGN_(OP)                                                                     \
+        array& OP(const short  &val);              /**< \copydoc OP (const array &) */  \
+        array& OP(const unsigned short &val);      /**< \copydoc OP (const array &) */  \
+
+#else
+#define ASSIGN(OP) ASSIGN_(OP)
+#endif
+
+
         /// \ingroup array_mem_operator_eq
         /// @{
         /// \brief Assignes the value(s) of val to the elements of the array.
@@ -892,6 +916,7 @@ namespace af
 
 
 #undef ASSIGN
+#undef ASSIGN_
 
         ///
         /// \brief Negates the values of the array
@@ -930,7 +955,7 @@ namespace af
     };
     // end of class array
 
-#define BIN_OP(OP)                                                                                                       \
+#define BIN_OP_(OP)                                                                                                      \
     AFAPI array OP (const array& lhs, const array& rhs);                                                                 \
     AFAPI array OP (const bool& lhs, const array& rhs);                 /**< \copydoc OP (const array&, const array&) */ \
     AFAPI array OP (const int& lhs, const array& rhs);                  /**< \copydoc OP (const array&, const array&) */ \
@@ -959,6 +984,18 @@ namespace af
     AFAPI array OP (const array& lhs, const cfloat& rhs);               /**< \copydoc OP (const array&, const array&) */ \
     AFAPI array OP (const array& lhs, const cdouble& rhs);              /**< \copydoc OP (const array&, const array&) */ \
 
+#if AF_API_VERSION >= 32
+#define BIN_OP(OP)                                                                                                       \
+        BIN_OP_(OP)                                                                                                      \
+        AFAPI array OP (const short& lhs, const array& rhs);            /**< \copydoc OP (const array&, const array&) */ \
+        AFAPI array OP (const unsigned short& lhs, const array& rhs);   /**< \copydoc OP (const array&, const array&) */ \
+        AFAPI array OP (const array& lhs, const short& rhs);            /**< \copydoc OP (const array&, const array&) */ \
+        AFAPI array OP (const array& lhs, const unsigned short& rhs);   /**< \copydoc OP (const array&, const array&) */ \
+
+#else
+#define BIN_OP(OP) BIN_OP_(OP)
+#endif
+
     /// \ingroup arith_func_add
     /// @{
     /// \brief Adds two arrays or an array and a value.
@@ -1010,7 +1047,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns an array with the equality operation performed on each element
+    /// \returns an array of type b8 with the equality operation performed on each element
     BIN_OP(operator==)
     /// @}
 
@@ -1021,7 +1058,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with the != operation performed on each element
+    /// \returns    an array of type b8 with the != operation performed on each element
     ///             of \p lhs and \p rhs
     BIN_OP(operator!=)
     /// @}
@@ -1033,7 +1070,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with the < operation performed on each element
+    /// \returns    an array of type b8 with the < operation performed on each element
     ///             of \p lhs and \p rhs
     BIN_OP(operator< )
     /// @}
@@ -1045,7 +1082,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with the <= operation performed on each element
+    /// \returns    an array of type b8 with the <= operation performed on each element
     ///             of \p lhs and \p rhs
     BIN_OP(operator<=)
     /// @}
@@ -1057,7 +1094,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with the > operation performed on each element
+    /// \returns    an array of type b8 with the > operation performed on each element
     ///             of \p lhs and \p rhs
     BIN_OP(operator> )
     /// @}
@@ -1069,7 +1106,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with the >= operation performed on each element
+    /// \returns    an array of type b8 with the >= operation performed on each element
     ///             of \p lhs and \p rhs
     BIN_OP(operator>=)
     /// @}
@@ -1082,7 +1119,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with a logical AND operation performed on each
+    /// \returns    an array of type b8 with a logical AND operation performed on each
     ///             element of \p lhs and \p rhs
     BIN_OP(operator&&)
     /// @}
@@ -1095,7 +1132,7 @@ namespace af
     /// \param[in] lhs the left hand side value of the operand
     /// \param[in] rhs the right hand side value of the operand
     ///
-    /// \returns    an array with a logical OR operation performed on each
+    /// \returns    an array of type b8 with a logical OR operation performed on each
     ///             element of \p lhs and \p rhs
     BIN_OP(operator||)
     /// @}
@@ -1178,6 +1215,7 @@ namespace af
     /// @}
 
 #undef BIN_OP
+#undef BIN_OP_
 
     /// Evaluate an expression (nonblocking).
     /**
diff --git a/include/af/backend.h b/include/af/backend.h
new file mode 100644
index 0000000000..93d8d8de58
--- /dev/null
+++ b/include/af/backend.h
@@ -0,0 +1,105 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \param[in] bknd takes one of the values of enum \ref af_backend
+   \returns \ref af_err error code
+
+   \ingroup unified_func_setbackend
+ */
+AFAPI af_err af_set_backend(const af_backend bknd);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \param[out] num_backends Number of available backends
+   \returns \ref af_err error code
+
+   \ingroup unified_func_getbackendcount
+ */
+AFAPI af_err af_get_backend_count(unsigned* num_backends);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \param[out] backends is the OR sum of the backends available.
+   \returns \ref af_err error code
+
+   \ingroup unified_func_getavailbackends
+ */
+AFAPI af_err af_get_available_backends(int* backends);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \param[out] backend takes one of the values of enum \ref af_backend
+   \param[in] in is the array who's backend is to be queried
+   \returns \ref af_err error code
+
+   \ingroup unified_func_getbackendid
+ */
+AFAPI af_err af_get_backend_id(af_backend *backend, const af_array in);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+namespace af
+{
+class array;
+
+#if AF_API_VERSION >= 32
+/**
+   \param[in] bknd takes one of the values of enum \ref af_backend
+
+   \ingroup unified_func_setbackend
+ */
+AFAPI void setBackend(const Backend bknd);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \returns Number of available backends
+
+   \ingroup unified_func_getbackendcount
+ */
+AFAPI unsigned getBackendCount();
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \returns OR sum of the backends available
+
+   \ingroup unified_func_getavailbackends
+ */
+AFAPI int getAvailableBackends();
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+   \param[in] in is the array who's backend is to be queried
+   \returns \ref af_backend which is the backend on which the array is created
+
+   \ingroup unified_func_getbackendid
+ */
+AFAPI af::Backend getBackendId(const array &in);
+#endif
+
+}
+#endif
diff --git a/include/af/cuda.h b/include/af/cuda.h
index 7cc3cd6501..5b5e25bb65 100644
--- a/include/af/cuda.h
+++ b/include/af/cuda.h
@@ -42,6 +42,18 @@ AFAPI af_err afcu_get_stream(cudaStream_t* stream, int id);
 AFAPI af_err afcu_get_native_id(int* nativeid, int id);
 #endif
 
+#if AF_API_VERSION >= 32
+/**
+   Set the CUDA device with given native id as the active device for ArrayFire
+
+   \param[in] nativeid native device id of the CUDA device
+   \returns \ref af_err error code
+
+   \ingroup cuda_mat
+ */
+AFAPI af_err afcu_set_native_id(int nativeid);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
@@ -89,5 +101,21 @@ static inline int getNativeId(int id)
 }
 #endif
 
+#if AF_API_VERSION >= 32
+/**
+   Set the CUDA device with given native id as the active device for ArrayFire
+
+   \param[in] nativeId native device id of the CUDA device
+
+   \ingroup cuda_mat
+ */
+static inline void setNativeId(int nativeId)
+{
+    af_err err = afcu_set_native_id(nativeId);
+    if (err!=AF_SUCCESS)
+        throw af::exception("Failed to change active CUDA device to the device with given native id");
+}
+#endif
+
 }
 #endif
diff --git a/include/af/defines.h b/include/af/defines.h
index c2b8eadc37..a25d23996d 100644
--- a/include/af/defines.h
+++ b/include/af/defines.h
@@ -132,10 +132,12 @@ typedef enum {
     ///
     AF_ERR_NOT_CONFIGURED = 302,
 
+#if AF_API_VERSION >= 32
     ///
     /// This build of ArrayFire is not compiled with "nonfree" algorithms
     ///
-    AFF_ERR_NONFREE       = 303,
+    AF_ERR_NONFREE        = 303,
+#endif
 
     // 400-499 Errors for missing hardware features
 
@@ -149,6 +151,30 @@ typedef enum {
     /// not support graphics
     ///
     AF_ERR_NO_GFX         = 402,
+
+    // 500-599 Errors specific to heterogenous API
+
+#if AF_API_VERSION >= 32
+    ///
+    /// There was an error when loading the libraries
+    ///
+    AF_ERR_LOAD_LIB       = 501,
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// There was an error when loading the symbols
+    ///
+    AF_ERR_LOAD_SYM       = 502,
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// There was a mismatch between the input array and the active backend
+    ///
+    AF_ERR_ARR_BKND_MISMATCH    = 503,
+#endif
+
     // 900-999 Errors from upstream libraries and runtimes
 
     ///
@@ -168,12 +194,18 @@ typedef enum {
     c32,    ///< 32-bit complex floating point values
     f64,    ///< 64-bit complex floating point values
     c64,    ///< 64-bit complex floating point values
-    b8,     ///< 8-bit boolean values
+    b8 ,    ///< 8-bit boolean values
     s32,    ///< 32-bit signed integral values
     u32,    ///< 32-bit unsigned integral values
-    u8,     ///< 8-bit unsigned integral values
+    u8 ,    ///< 8-bit unsigned integral values
     s64,    ///< 64-bit signed integral values
-    u64     ///< 64-bit unsigned integral values
+    u64,    ///< 64-bit unsigned integral values
+#if AF_API_VERSION >= 32
+    s16,    ///< 16-bit signed integral values
+#endif
+#if AF_API_VERSION >= 32
+    u16,    ///< 16-bit unsigned integral values
+#endif
 } af_dtype;
 
 typedef enum {
@@ -249,17 +281,21 @@ typedef enum {
     AF_SHD        ///< Match based on Sum of Hamming Distances (SHD)
 } af_match_type;
 
+#if AF_API_VERSION >= 31
 typedef enum {
     AF_YCC_601 = 601,  ///< ITU-R BT.601 (formerly CCIR 601) standard
     AF_YCC_709 = 709,  ///< ITU-R BT.709 standard
     AF_YCC_2020 = 2020  ///< ITU-R BT.2020 standard
 } af_ycc_std;
+#endif
 
 typedef enum {
     AF_GRAY = 0, ///< Grayscale
     AF_RGB,      ///< 3-channel RGB
     AF_HSV,      ///< 3-channel HSV
+#if AF_API_VERSION >= 31
     AF_YCbCr     ///< 3-channel YCbCr
+#endif
 } af_cspace_t;
 
 typedef enum {
@@ -300,6 +336,7 @@ typedef enum {
     AF_COLORMAP_BLUE    = 6     ///< Blue hue map
 } af_colormap;
 
+#if AF_API_VERSION >= 31
 typedef enum {
     AF_FIF_BMP          = 0,    ///< FreeImage Enum for Bitmap File
     AF_FIF_ICO          = 1,    ///< FreeImage Enum for Windows Icon File
@@ -315,6 +352,24 @@ typedef enum {
     AF_FIF_JP2          = 31,   ///< FreeImage Enum for JPEG-2000 File
     AF_FIF_RAW          = 34    ///< FreeImage Enum for RAW Camera Image File
 } af_image_format;
+#endif
+
+#if AF_API_VERSION >= 32
+typedef enum {
+    AF_HOMOGRAPHY_RANSAC = 0,   ///< Computes homography using RANSAC
+    AF_HOMOGRAPHY_LMEDS  = 1    ///< Computes homography using Least Median of Squares
+} af_homography_type;
+#endif
+
+#if AF_API_VERSION >= 32
+// These enums should be 2^x
+typedef enum {
+    AF_BACKEND_DEFAULT = 0,  ///< Default backend order: OpenCL -> CUDA -> CPU
+    AF_BACKEND_CPU     = 1,  ///< CPU a.k.a sequential algorithms
+    AF_BACKEND_CUDA    = 2,  ///< CUDA Compute Backend
+    AF_BACKEND_OPENCL  = 4,  ///< OpenCL Compute Backend
+} af_backend;
+#endif
 
 // Below enum is purely added for example purposes
 // it doesn't and shoudn't be used anywhere in the
@@ -340,8 +395,15 @@ namespace af
     typedef af_mat_prop matProp;
     typedef af_colormap ColorMap;
     typedef af_norm_type normType;
+#if AF_API_VERSION >= 31
     typedef af_ycc_std YCCStd;
+#endif
+#if AF_API_VERSION >= 31
     typedef af_image_format imageFormat;
+#endif
+#if AF_API_VERSION >= 32
+    typedef af_backend Backend;
+#endif
 }
 
 #endif
diff --git a/include/af/graphics.h b/include/af/graphics.h
index 1fd9108d7a..5c143c721e 100644
--- a/include/af/graphics.h
+++ b/include/af/graphics.h
@@ -47,6 +47,8 @@ class AFAPI Window {
         /**
            Creates a window object with default width
            and height with title set to "ArrayFire"
+
+           \ingroup gfx_func_window
          */
         Window();
 
@@ -55,6 +57,8 @@ class AFAPI Window {
            and height using the title provided by the user
 
            \param[in] title is the window title
+
+           \ingroup gfx_func_window
          */
         Window(const char* const title);
 
@@ -65,6 +69,8 @@ class AFAPI Window {
            \param[in] width is the window width
            \param[in] height is the window height
            \param[in] title is the window title with default value as "ArrayFire"
+
+           \ingroup gfx_func_window
          */
         Window(const int width, const int height, const char* const title="ArrayFire");
 
@@ -74,10 +80,14 @@ class AFAPI Window {
 
            \param[in] wnd is an \ref af_window handle which can be retrieved by
            doing a get call on any \ref Window object
+
+           \ingroup gfx_func_window
          */
         Window(const af_window wnd);
         /**
            Destroys the window handle
+
+           \ingroup gfx_func_window
          */
         ~Window();
 
@@ -85,6 +95,8 @@ class AFAPI Window {
 
         /**
            \return Returns the \ref af_window window handle.
+
+           \ingroup gfx_func_window
          */
         af_window get() const { return wnd; }
 
@@ -93,6 +105,8 @@ class AFAPI Window {
 
            \param[in] x is horizontal coordinate
            \param[in] y is vertical coordinate
+
+           \ingroup gfx_func_window
          */
         void setPos(const unsigned x, const unsigned y);
 
@@ -100,6 +114,8 @@ class AFAPI Window {
            Set the window title
 
            \param[in] title is the window title
+
+           \ingroup gfx_func_window
          */
         void setTitle(const char* const title);
 
@@ -109,6 +125,8 @@ class AFAPI Window {
 
            \param[in]   w is target width of the window
            \param[in]   h is target height of the window
+
+           \ingroup gfx_func_window
          */
         void setSize(const unsigned w, const unsigned h);
 #endif
@@ -117,6 +135,8 @@ class AFAPI Window {
            Set the colormap to be used for subsequent rendering calls
 
            \param[in] cmap should be one of the enum values from \ref ColorMap
+
+           \ingroup gfx_func_window
          */
         void setColorMap(const ColorMap cmap);
 
@@ -127,9 +147,25 @@ class AFAPI Window {
            \param[in] title parameter is used when this function is called in grid mode
 
            \note \p in should be 2d array or 3d array with 3 channels.
+
+           \ingroup gfx_func_draw
          */
         void image(const array& in, const char* title=NULL);
 
+#if AF_API_VERSION >= 32
+        /**
+           Renders the input array as an 3d line plot to the window
+
+           \param[in] in is an \ref array
+           \param[in] title parameter is used when this function is called in grid mode
+
+           \note \p in should be 1d array of size 3n or 2d array with (3 x n) or (n x 3) channels.
+
+           \ingroup gfx_func_draw
+         */
+        void plot3(const array& in, const char* title=NULL);
+#endif
+
         /**
            Renders the input arrays as a 2D plot to the window
 
@@ -138,7 +174,10 @@ class AFAPI Window {
            \param[in] title parameter is used when this function is called in grid mode
 
            \note \p X and \p Y should be vectors.
+
+           \ingroup gfx_func_draw
          */
+
         void plot(const array& X, const array& Y, const char* const title=NULL);
 
         /**
@@ -150,20 +189,56 @@ class AFAPI Window {
            \param[in] title parameter is used when this function is called in grid mode
 
            \note \p X should be a vector.
+
+           \ingroup gfx_func_draw
          */
         void hist(const array& X, const double minval, const double maxval, const char* const title=NULL);
 
+#if AF_API_VERSION >= 32
+        /**
+           Renders the input arrays as a 3D surface plot to the window
+
+           \param[in] S is an \ref array with the z-axis data points
+           \param[in] title parameter is used when this function is called in grid mode
+
+           \note \p S should be a 2D array
+
+           \ingroup gfx_func_draw
+         */
+        void surface(const array& S, const char* const title);
+#endif
+
+#if AF_API_VERSION >= 32
+        /**
+           Renders the input arrays as a 3D surface plot to the window
+
+           \param[in] xVals is an \ref array with the x-axis data points
+           \param[in] yVals is an \ref array with the y-axis data points
+           \param[in] S is an \ref array with the z-axis data points
+           \param[in] title parameter is used when this function is called in grid mode
+
+           \note \p X and \p Y should be vectors or 2D arrays \p S should be s 2D array
+
+           \ingroup gfx_func_draw
+         */
+        void surface(const array& xVals, const array& yVals, const array& S, const char* const title);
+#endif
+
         /**
            Setup grid layout for multiview mode in a window
 
            \param[in]   rows is number of rows you want to show in a window
            \param[in]   cols is number of coloumns you want to show in a window
+
+           \ingroup gfx_func_window
         */
         void grid(const int rows, const int cols);
 
         /**
            This function swaps the background buffer to current view
            and polls for any key strokes while the window was in focus
+
+           \ingroup gfx_func_window
         */
         void show();
 
@@ -173,6 +248,8 @@ class AFAPI Window {
 
            \return     \ref AF_SUCCESS if window show is successful, otherwise an appropriate error code
            is returned.
+
+           \ingroup gfx_func_window
         */
         bool close();
 
@@ -185,6 +262,8 @@ class AFAPI Window {
 
            \return a reference to the object pointed by this
            to enable cascading this call with rendering functions.
+
+           \ingroup gfx_window_func
          */
         inline Window& operator()(const int r, const int c) {
             _r = r; _c = c;
@@ -210,7 +289,7 @@ extern "C" {
    \return     \ref AF_SUCCESS if window creation is successful, otherwise an appropriate error code
    is returned.
 
-   \ingroup gfx_window_func
+   \ingroup gfx_func_window
 */
 AFAPI af_err af_create_window(af_window *out, const int width, const int height, const char* const title);
 
@@ -292,6 +371,25 @@ AFAPI af_err af_draw_image(const af_window wind, const af_array in, const af_cel
 */
 AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props);
 
+#if AF_API_VERSION >= 32
+/**
+   C Interface wrapper for drawing an array as a plot
+
+   \param[in]   wind is the window handle
+   \param[in]   P is an \ref af_array or matrix with the xyz-values of the points
+   \param[in]   props is structure \ref af_cell that has the properties that are used
+   for the current rendering.
+
+   \return     \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code
+   is returned.
+
+   \note \p P should be a 3n x 1  vector or one of a 3xn or nx3 matrices.
+
+   \ingroup gfx_func_draw
+*/
+AFAPI af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props);
+#endif
+
 /**
    C Interface wrapper for drawing an array as a histogram
 
@@ -311,6 +409,27 @@ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array
 */
 AFAPI af_err af_draw_hist(const af_window wind, const af_array X, const double minval, const double maxval, const af_cell* const props);
 
+#if AF_API_VERSION >= 32
+/**
+   C Interface wrapper for drawing arrayis as a surface
+
+   \param[in]   wind is the window handle
+   \param[in]   xVals is an \ref af_array with the x-axis data points
+   \param[in]   yVals is an \ref af_array with the y-axis data points
+   \param[in]   S is an \ref af_array with the z-axis data points
+   \param[in]   props is structure \ref af_cell that has the properties that are used
+   for the current rendering.
+
+   \return     \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code
+   is returned.
+
+   \note \p X and \p Y should be vectors. \p S should be a 2D array
+
+   \ingroup gfx_func_draw
+*/
+af_err af_draw_surface(const af_window wind, const af_array xVals, const af_array yVals, const af_array S, const af_cell* const props);
+#endif
+
 /**
    C Interface wrapper for grid setup in a window
 
diff --git a/include/af/image.h b/include/af/image.h
index 1c16280c12..f38bb41694 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -96,6 +96,57 @@ AFAPI void* saveImageMem(const array& in, const imageFormat format = AF_FIF_PNG)
 AFAPI void deleteImageMem(void *ptr);
 #endif
 
+#if AF_API_VERSION >= 32
+/**
+    C++ Interface for loading an image as its original type
+
+    This load image function allows you to load images as u8, u16 or f32
+    depending on the type of input image as shown by the table below.
+
+     Bits per Color (Gray/RGB/RGBA Bits Per Pixel) | Array Type  | Range
+    -----------------------------------------------|-------------|---------------
+      8 ( 8/24/32  BPP)                            | u8          | 0 - 255
+     16 (16/48/64  BPP)                            | u16         | 0 - 65535
+     32 (32/96/128 BPP)                            | f32         | 0 - 1
+
+    \param[in] filename is name of file to be loaded
+    \return image loaded as \ref af::array()
+
+    \ingroup imageio_func_load
+*/
+AFAPI array loadImageNative(const char* filename);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+    C++ Interface for saving an image without modifications
+
+    This function only accepts u8, u16, f32 arrays. These arrays are saved to
+    images without any modifications.
+
+    You must also note that note all image type support 16 or 32 bit images.
+
+    The best options for 16 bit images are PNG, PPM and TIFF.
+    The best option for 32 bit images is TIFF.
+    These allow lossless storage.
+
+    The images stored have the following properties:
+
+     Array Type  | Bits per Color (Gray/RGB/RGBA Bits Per Pixel) | Range
+    -------------|-----------------------------------------------|---------------
+     u8          |  8 ( 8/24/32  BPP)                            | 0 - 255
+     u16         | 16 (16/48/64  BPP)                            | 0 - 65535
+     f32         | 32 (32/96/128 BPP)                            | 0 - 1
+
+    \param[in] filename is name of file to be saved
+    \param[in] in is the array to be saved. Should be u8 for saving 8-bit image,
+    u16 for 16-bit image, and f32 for 32-bit image.
+
+    \ingroup imageio_func_save
+*/
+AFAPI void saveImageNative(const char* filename, const array& in);
+#endif
+
 /**
     C++ Interface for resizing an image to specified dimensions
 
@@ -230,7 +281,7 @@ AFAPI array bilateral(const array &in, const float spatial_sigma, const float ch
    \param[in]  nbins  Number of bins to populate between min and max
    \param[in]  minval minimum bin value (accumulates -inf to min)
    \param[in]  maxval minimum bin value (accumulates max to +inf)
-   \return     histogram array
+   \return     histogram array of type u32
 
    \ingroup image_func_histogram
  */
@@ -243,7 +294,7 @@ AFAPI array histogram(const array &in, const unsigned nbins, const double minval
 
    \param[in]  in is the input array
    \param[in]  nbins  Number of bins to populate between min and max
-   \return     histogram array
+   \return     histogram array of type u32
 
    \ingroup image_func_histogram
  */
@@ -689,6 +740,60 @@ extern "C" {
     AFAPI af_err af_delete_image_memory(void* ptr);
 #endif
 
+#if AF_API_VERSION >= 32
+    /**
+        C Interface for loading an image as is original type
+
+        This load image function allows you to load images as u8, u16 or f32
+        depending on the type of input image as shown by the table below.
+
+         Bits per Color (Gray/RGB/RGBA Bits Per Pixel) | Array Type  | Range
+        -----------------------------------------------|-------------|---------------
+          8 ( 8/24/32  BPP)                            | u8          | 0 - 255
+         16 (16/48/64  BPP)                            | u16         | 0 - 65535
+         32 (32/96/128 BPP)                            | f32         | 0 - 1
+
+        \param[out] out contains them image
+        \param[in] filename is name of file to be loaded
+        \return     \ref AF_SUCCESS if successful
+
+        \ingroup imageio_func_load
+    */
+    AFAPI af_err af_load_image_native(af_array *out, const char* filename);
+#endif
+
+#if AF_API_VERSION >= 32
+    /**
+        C Interface for saving an image without modifications
+
+        This function only accepts u8, u16, f32 arrays. These arrays are saved to
+        images without any modifications.
+
+        You must also note that note all image type support 16 or 32 bit images.
+
+        The best options for 16 bit images are PNG, PPM and TIFF.
+        The best option for 32 bit images is TIFF.
+        These allow lossless storage.
+
+        The images stored have the following properties:
+
+         Array Type  | Bits per Color (Gray/RGB/RGBA Bits Per Pixel) | Range
+        -------------|-----------------------------------------------|---------------
+         u8          |  8 ( 8/24/32  BPP)                            | 0 - 255
+         u16         | 16 (16/48/64  BPP)                            | 0 - 65535
+         f32         | 32 (32/96/128 BPP)                            | 0 - 1
+
+        \param[in] filename is name of file to be saved
+        \param[in] in is the array to be saved. Should be u8 for saving 8-bit image,
+        u16 for 16-bit image, and f32 for 32-bit image.
+
+        \return     \ref AF_SUCCESS if successful
+
+        \ingroup imageio_func_save
+    */
+    AFAPI af_err af_save_image_native(const char* filename, const af_array in);
+#endif
+
     /**
        C Interface for resizing an image to specified dimensions
 
@@ -796,7 +901,7 @@ extern "C" {
     /**
        C Interface for histogram
 
-       \param[out] out is the histogram for input array in
+       \param[out] out (type u32) is the histogram for input array in
        \param[in]  in is the input array
        \param[in]  nbins  Number of bins to populate between min and max
        \param[in]  minval minimum bin value (accumulates -inf to min)
diff --git a/include/af/index.h b/include/af/index.h
index e3bb77b0fd..79bf1229a5 100644
--- a/include/af/index.h
+++ b/include/af/index.h
@@ -289,6 +289,78 @@ extern "C" {
                                 const dim_t ndims, const af_index_t* indices,
                                 const af_array rhs);
 
+#if AF_API_VERSION >= 32
+    ///
+    /// \brief Create an quadruple of af_index_t array
+    ///
+    /// \param[out] indexers pointer to location where quadruple af_index_t array is created
+    /// \returns \ref af_err error code
+    ///
+    /// \ingroup index_func_util
+    ///
+    AFAPI af_err af_create_indexers(af_index_t** indexers);
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// \brief set \p dim to given indexer af_array \p idx
+    ///
+    /// \param[in] indexer pointer to location where quadruple af_index_t array was created
+    /// \param[in] idx is the af_array indexer for given dimension \p dim
+    /// \param[in] dim is the dimension to be indexed
+    /// \returns \ref af_err error code
+    ///
+    /// \ingroup index_func_util
+    ///
+    AFAPI af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim);
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// \brief set \p dim to given indexer af_array \p idx
+    ///
+    /// \param[in] indexer pointer to location where quadruple af_index_t array was created
+    /// \param[in] idx is the af_seq indexer for given dimension \p dim
+    /// \param[in] dim is the dimension to be indexed
+    /// \param[in] is_batch indicates if the sequence based indexing is inside a batch operation
+    ///
+    /// \ingroup index_func_util
+    ///
+    AFAPI af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx,
+                                  const dim_t dim, const bool is_batch);
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// \brief set \p dim to given indexer af_array \p idx
+    ///
+    /// \param[in] indexer pointer to location where quadruple af_index_t array was created
+    /// \param[in] begin is the beginning index of along dimension \p dim
+    /// \param[in] end is the beginning index of along dimension \p dim
+    /// \param[in] step size along dimension \p dim
+    /// \param[in] dim is the dimension to be indexed
+    /// \param[in] is_batch indicates if the sequence based indexing is inside a batch operation
+    /// \returns \ref af_err error code
+    ///
+    /// \ingroup index_func_util
+    ///
+    AFAPI af_err af_set_seq_param_indexer(af_index_t* indexer,
+                                        const double begin, const double end, const double step,
+                                        const dim_t dim, const bool is_batch);
+#endif
+
+#if AF_API_VERSION >= 32
+    ///
+    /// \brief Release's the memory resource used by the quadruple af_index_t array
+    ///
+    /// \param[in] indexers is pointer to location where quadruple af_index_t array is created
+    //  \returns \ref af_err error code
+    ///
+    /// \ingroup index_func_util
+    ///
+    AFAPI af_err af_release_indexers(af_index_t* indexers);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/af/macros.h b/include/af/macros.h
new file mode 100644
index 0000000000..42a4219ac8
--- /dev/null
+++ b/include/af/macros.h
@@ -0,0 +1,24 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <stdio.h>
+
+///
+/// Print a line on screen using printf syntax.
+/// Usage: Uses same syntax and semantics as printf.
+/// Output: \<filename\>:\<line number\>: \<message\>
+///
+#ifndef AF_MSG
+#define AF_MSG(fmt,...) do {            \
+        printf("%s:%d: " fmt "\n",      \
+                 __FILE__, __LINE__, ##__VA_ARGS__);      \
+        } while (0);
+#endif
+
diff --git a/include/af/opencl.h b/include/af/opencl.h
index c9f245e30a..271879fdc9 100644
--- a/include/af/opencl.h
+++ b/include/af/opencl.h
@@ -19,43 +19,54 @@
 extern "C" {
 #endif
 
-    /**
-        \ingroup opencl_mat
-        @{
-    */
-    /**
-      Get a handle to ArrayFire's OpenCL context
-
-      \param[out] ctx the current context being used by ArrayFire
-      \param[in] retain if true calls clRetainContext prior to returning the context
-      \returns \ref af_err error code
-
-      \note Set \p retain to true if this value will be passed to a cl::Context constructor
-    */
-    AFAPI af_err afcl_get_context(cl_context *ctx, const bool retain);
-
-    /**
-      Get a handle to ArrayFire's OpenCL command queue
-
-      \param[out] queue the current command queue being used by ArrayFire
-      \param[in] retain if true calls clRetainCommandQueue prior to returning the context
-      \returns \ref af_err error code
-
-      \note Set \p retain to true if this value will be passed to a cl::CommandQueue constructor
-    */
-    AFAPI af_err afcl_get_queue(cl_command_queue *queue, const bool retain);
-
-    /**
-       Get the device ID for ArrayFire's current active device
-
-       \param[out] id the cl_device_id of the current device
-       \returns \ref af_err error code
-    */
-    AFAPI af_err afcl_get_device_id(cl_device_id *id);
-
-    /**
-      @}
-    */
+/**
+    \ingroup opencl_mat
+    @{
+*/
+/**
+  Get a handle to ArrayFire's OpenCL context
+
+  \param[out] ctx the current context being used by ArrayFire
+  \param[in] retain if true calls clRetainContext prior to returning the context
+  \returns \ref af_err error code
+
+  \note Set \p retain to true if this value will be passed to a cl::Context constructor
+*/
+AFAPI af_err afcl_get_context(cl_context *ctx, const bool retain);
+
+/**
+  Get a handle to ArrayFire's OpenCL command queue
+
+  \param[out] queue the current command queue being used by ArrayFire
+  \param[in] retain if true calls clRetainCommandQueue prior to returning the context
+  \returns \ref af_err error code
+
+  \note Set \p retain to true if this value will be passed to a cl::CommandQueue constructor
+*/
+AFAPI af_err afcl_get_queue(cl_command_queue *queue, const bool retain);
+
+/**
+   Get the device ID for ArrayFire's current active device
+
+   \param[out] id the cl_device_id of the current device
+   \returns \ref af_err error code
+*/
+AFAPI af_err afcl_get_device_id(cl_device_id *id);
+
+#if AF_API_VERSION >= 32
+/**
+   Set ArrayFire's active device based on \p id of type cl_device_id
+
+   \param[in] id the cl_device_id of the device to be set as active device
+   \returns \ref af_err error code
+*/
+AFAPI af_err afcl_set_device_id(cl_device_id id);
+#endif
+
+/**
+  @}
+*/
+
 #ifdef __cplusplus
 }
 #endif
@@ -70,187 +81,205 @@ extern "C" {
 
 namespace afcl
 {
-   /**
-
-    */
-    /**
-        \ingroup opencl_mat
-        @{
-    */
-    /**
-    Get a handle to ArrayFire's OpenCL context
-
-    \param[in] retain if true calls clRetainContext prior to returning the context
-    \returns the current context being used by ArrayFire
-
-    \note Set \p retain to true if this value will be passed to a cl::Context constructor
-    */
-    static inline cl_context getContext(bool retain = false)
-    {
-        cl_context ctx;
-        af_err err = afcl_get_context(&ctx, retain);
-        if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL context from arrayfire");
-        return ctx;
-    }
-
-    /**
-    Get a handle to ArrayFire's OpenCL command queue
-
-    \param[in] retain if true calls clRetainCommandQueue prior to returning the context
-    \returns the current command queue being used by ArrayFire
-
-    \note Set \p retain to true if this value will be passed to a cl::CommandQueue constructor
-    */
-    static inline cl_command_queue getQueue(bool retain = false)
-    {
-        cl_command_queue queue;
-        af_err err = afcl_get_queue(&queue, retain);
-        if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL command queue from arrayfire");
-        return queue;
-    }
-
-    /**
-       Get the device ID for ArrayFire's current active device
-       \returns the cl_device_id of the current device
-    */
-    static inline cl_device_id getDeviceId()
-    {
-        cl_device_id id;
-        af_err err = afcl_get_device_id(&id);
-        if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL device ID");
-
-        return id;
-    }
-
-    /**
-    Create an af::array object from an OpenCL cl_mem buffer
-
-    \param[in] idims the dimensions of the buffer
-    \param[in] buf the OpenCL memory object
-    \param[in] type the data type contained in the buffer
-    \param[in] retain if true, instructs ArrayFire to retain the memory object
-    \returns an array object created from the OpenCL buffer
-
-    \note Set \p retain to true if the memory originates from a cl::Buffer object
-     */
-    static inline af::array array(af::dim4 idims, cl_mem buf, af::dtype type, bool retain=false)
-    {
-        const unsigned ndims = (unsigned)idims.ndims();
-        const dim_t *dims = idims.get();
-
-        cl_context context;
-        cl_int clerr = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(cl_context), &context, NULL);
-        if (clerr != CL_SUCCESS) {
-            throw af::exception("Failed to get context from cl_mem object \"buf\" ");
-        }
-
-        if (context != getContext()) {
-            throw(af::exception("Context mismatch between input \"buf\" and arrayfire"));
-        }
-
-
-        if (retain) clerr = clRetainMemObject(buf);
-
-        af_array out;
-        af_err err = af_device_array(&out, buf, ndims, dims, type);
-
-        if (err != AF_SUCCESS || clerr != CL_SUCCESS) {
-            if (retain && clerr == CL_SUCCESS) clReleaseMemObject(buf);
-            throw af::exception("Failed to create device array");
-        }
-
-        return af::array(out);
-    }
-
-    /**
-    Create an af::array object from an OpenCL cl_mem buffer
-
-    \param[in] dim0 the length of the first dimension of the buffer
-    \param[in] buf the OpenCL memory object
-    \param[in] type the data type contained in the buffer
-    \param[in] retain if true, instructs ArrayFire to retain the memory object
-    \returns an array object created from the OpenCL buffer
-
-    \note Set \p retain to true if the memory originates from a cl::Buffer object
-     */
-    static inline af::array array(dim_t dim0,
-                                  cl_mem buf, af::dtype type, bool retain=false)
-    {
-        return afcl::array(af::dim4(dim0), buf, type, retain);
-    }
-
-    /**
-    Create an af::array object from an OpenCL cl_mem buffer
-
-    \param[in] dim0 the length of the first dimension of the buffer
-    \param[in] dim1 the length of the second dimension of the buffer
-    \param[in] buf the OpenCL memory object
-    \param[in] type the data type contained in the buffer
-    \param[in] retain if true, instructs ArrayFire to retain the memory object
-    \returns an array object created from the OpenCL buffer
-
-    \note Set \p retain to true if the memory originates from a cl::Buffer object
-     */
-    static inline af::array array(dim_t dim0, dim_t dim1,
-                                  cl_mem buf, af::dtype type, bool retain=false)
-    {
-        return afcl::array(af::dim4(dim0, dim1), buf, type, retain);
-    }
-
-    /**
-    Create an af::array object from an OpenCL cl_mem buffer
-
-    \param[in] dim0 the length of the first dimension of the buffer
-    \param[in] dim1 the length of the second dimension of the buffer
-    \param[in] dim2 the length of the third dimension of the buffer
-    \param[in] buf the OpenCL memory object
-    \param[in] type the data type contained in the buffer
-    \param[in] retain if true, instructs ArrayFire to retain the memory object
-    \returns an array object created from the OpenCL buffer
-
-    \note Set \p retain to true if the memory originates from a cl::Buffer object
-     */
-    static inline af::array array(dim_t dim0, dim_t dim1,
-                                  dim_t dim2,
-                                  cl_mem buf, af::dtype type, bool retain=false)
-    {
-        return afcl::array(af::dim4(dim0, dim1, dim2), buf, type, retain);
-    }
-
-    /**
-    Create an af::array object from an OpenCL cl_mem buffer
-
-    \param[in] dim0 the length of the first dimension of the buffer
-    \param[in] dim1 the length of the second dimension of the buffer
-    \param[in] dim2 the length of the third dimension of the buffer
-    \param[in] dim3 the length of the fourth dimension of the buffer
-    \param[in] buf the OpenCL memory object
-    \param[in] type the data type contained in the buffer
-    \param[in] retain if true, instructs ArrayFire to retain the memory object
-    \returns an array object created from the OpenCL buffer
-
-    \note Set \p retain to true if the memory originates from a cl::Buffer object
-     */
-    static inline af::array array(dim_t dim0, dim_t dim1,
-                                  dim_t dim2, dim_t dim3,
-                                  cl_mem buf, af::dtype type, bool retain=false)
-    {
-        return afcl::array(af::dim4(dim0, dim1, dim2, dim3), buf, type, retain);
-    }
-
-    /**
-      @}
-    */
+
+/**
+
+ */
+ /**
+     \ingroup opencl_mat
+     @{
+ */
+ /**
+ Get a handle to ArrayFire's OpenCL context
+
+ \param[in] retain if true calls clRetainContext prior to returning the context
+ \returns the current context being used by ArrayFire
+
+ \note Set \p retain to true if this value will be passed to a cl::Context constructor
+ */
+ static inline cl_context getContext(bool retain = false)
+ {
+     cl_context ctx;
+     af_err err = afcl_get_context(&ctx, retain);
+     if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL context from arrayfire");
+     return ctx;
+ }
+
+ /**
+ Get a handle to ArrayFire's OpenCL command queue
+
+ \param[in] retain if true calls clRetainCommandQueue prior to returning the context
+ \returns the current command queue being used by ArrayFire
+
+ \note Set \p retain to true if this value will be passed to a cl::CommandQueue constructor
+ */
+ static inline cl_command_queue getQueue(bool retain = false)
+ {
+     cl_command_queue queue;
+     af_err err = afcl_get_queue(&queue, retain);
+     if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL command queue from arrayfire");
+     return queue;
+ }
+
+ /**
+    Get the device ID for ArrayFire's current active device
+    \returns the cl_device_id of the current device
+ */
+ static inline cl_device_id getDeviceId()
+ {
+     cl_device_id id;
+     af_err err = afcl_get_device_id(&id);
+     if (err != AF_SUCCESS) throw af::exception("Failed to get OpenCL device ID");
+
+     return id;
+ }
+
+#if AF_API_VERSION >= 32
+ /**
+   Set ArrayFire's active device based on \p id of type cl_device_id
+
+   \param[in] id the cl_device_id of the device to be set as active device
+ */
+ static inline void setDeviceId(cl_device_id id)
+ {
+     af_err err = afcl_set_device_id(id);
+     if (err != AF_SUCCESS) throw af::exception("Failed to set OpenCL device as active device");
+ }
+#endif
+
+ /**
+ Create an af::array object from an OpenCL cl_mem buffer
+
+ \param[in] idims the dimensions of the buffer
+ \param[in] buf the OpenCL memory object
+ \param[in] type the data type contained in the buffer
+ \param[in] retain if true, instructs ArrayFire to retain the memory object
+ \returns an array object created from the OpenCL buffer
+
+ \note Set \p retain to true if the memory originates from a cl::Buffer object
+  */
+ static inline af::array array(af::dim4 idims, cl_mem buf, af::dtype type, bool retain=false)
+ {
+     const unsigned ndims = (unsigned)idims.ndims();
+     const dim_t *dims = idims.get();
+
+     cl_context context;
+     cl_int clerr = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(cl_context), &context, NULL);
+     if (clerr != CL_SUCCESS) {
+         throw af::exception("Failed to get context from cl_mem object \"buf\" ");
+     }
+
+     if (context != getContext()) {
+         throw(af::exception("Context mismatch between input \"buf\" and arrayfire"));
+     }
+
+
+     if (retain) clerr = clRetainMemObject(buf);
+
+     af_array out;
+     af_err err = af_device_array(&out, buf, ndims, dims, type);
+
+     if (err != AF_SUCCESS || clerr != CL_SUCCESS) {
+         if (retain && clerr == CL_SUCCESS) clReleaseMemObject(buf);
+         throw af::exception("Failed to create device array");
+     }
+
+     return af::array(out);
+ }
+
+ /**
+ Create an af::array object from an OpenCL cl_mem buffer
+
+ \param[in] dim0 the length of the first dimension of the buffer
+ \param[in] buf the OpenCL memory object
+ \param[in] type the data type contained in the buffer
+ \param[in] retain if true, instructs ArrayFire to retain the memory object
+ \returns an array object created from the OpenCL buffer
+
+ \note Set \p retain to true if the memory originates from a cl::Buffer object
+  */
+ static inline af::array array(dim_t dim0,
+                               cl_mem buf, af::dtype type, bool retain=false)
+ {
+     return afcl::array(af::dim4(dim0), buf, type, retain);
+ }
+
+ /**
+ Create an af::array object from an OpenCL cl_mem buffer
+
+ \param[in] dim0 the length of the first dimension of the buffer
+ \param[in] dim1 the length of the second dimension of the buffer
+ \param[in] buf the OpenCL memory object
+ \param[in] type the data type contained in the buffer
+ \param[in] retain if true, instructs ArrayFire to retain the memory object
+ \returns an array object created from the OpenCL buffer
+
+ \note Set \p retain to true if the memory originates from a cl::Buffer object
+  */
+ static inline af::array array(dim_t dim0, dim_t dim1,
+                               cl_mem buf, af::dtype type, bool retain=false)
+ {
+     return afcl::array(af::dim4(dim0, dim1), buf, type, retain);
+ }
+
+ /**
+ Create an af::array object from an OpenCL cl_mem buffer
+
+ \param[in] dim0 the length of the first dimension of the buffer
+ \param[in] dim1 the length of the second dimension of the buffer
+ \param[in] dim2 the length of the third dimension of the buffer
+ \param[in] buf the OpenCL memory object
+ \param[in] type the data type contained in the buffer
+ \param[in] retain if true, instructs ArrayFire to retain the memory object
+ \returns an array object created from the OpenCL buffer
+
+ \note Set \p retain to true if the memory originates from a cl::Buffer object
+  */
+ static inline af::array array(dim_t dim0, dim_t dim1,
+                               dim_t dim2,
+                               cl_mem buf, af::dtype type, bool retain=false)
+ {
+     return afcl::array(af::dim4(dim0, dim1, dim2), buf, type, retain);
+ }
+
+ /**
+ Create an af::array object from an OpenCL cl_mem buffer
+
+ \param[in] dim0 the length of the first dimension of the buffer
+ \param[in] dim1 the length of the second dimension of the buffer
+ \param[in] dim2 the length of the third dimension of the buffer
+ \param[in] dim3 the length of the fourth dimension of the buffer
+ \param[in] buf the OpenCL memory object
+ \param[in] type the data type contained in the buffer
+ \param[in] retain if true, instructs ArrayFire to retain the memory object
+ \returns an array object created from the OpenCL buffer
+
+ \note Set \p retain to true if the memory originates from a cl::Buffer object
+  */
+ static inline af::array array(dim_t dim0, dim_t dim1,
+                               dim_t dim2, dim_t dim3,
+                               cl_mem buf, af::dtype type, bool retain=false)
+ {
+     return afcl::array(af::dim4(dim0, dim1, dim2, dim3), buf, type, retain);
+ }
+
+ /**
+   @}
+ */
+
+}
+
+namespace af
+{
+
+template<> AFAPI cl_mem *array::device() const
+{
+    cl_mem *mem = new cl_mem;
+    af_err err = af_get_device_ptr((void **)mem, get());
+    if (err != AF_SUCCESS) throw af::exception("Failed to get cl_mem from array object");
+    return mem;
 }
 
-namespace af {
-    template<> AFAPI cl_mem *array::device() const
-    {
-        cl_mem *mem = new cl_mem;
-        af_err err = af_get_device_ptr((void **)mem, get());
-        if (err != AF_SUCCESS) throw af::exception("Failed to get cl_mem from array object");
-        return mem;
-    }
 }
 
 #endif
diff --git a/include/af/statistics.h b/include/af/statistics.h
index fd35bc5a86..4d02d4aea0 100644
--- a/include/af/statistics.h
+++ b/include/af/statistics.h
@@ -205,7 +205,7 @@ extern "C" {
    \param[out] out will contain the mean of the input array along dimension \p dim
    \param[in] in is the input array
    \param[in] dim the dimension along which the mean is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_mean
@@ -219,7 +219,7 @@ AFAPI af_err af_mean(af_array *out, const af_array in, const dim_t dim);
    \param[in] in is the input array
    \param[in] weights is used to scale input \p in before getting mean
    \param[in] dim the dimension along which the mean is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_mean
@@ -233,7 +233,7 @@ AFAPI af_err af_mean_weighted(af_array *out, const af_array in, const af_array w
    \param[in] in is the input array
    \param[in] isbiased is boolean denoting Population variance (false) or Sample Variance (true)
    \param[in] dim the dimension along which the variance is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_var
@@ -248,7 +248,7 @@ AFAPI af_err af_var(af_array *out, const af_array in, const bool isbiased, const
    \param[in] in is the input array
    \param[in] weights is used to scale input \p in before getting variance
    \param[in] dim the dimension along which the variance is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_var
@@ -262,7 +262,7 @@ AFAPI af_err af_var_weighted(af_array *out, const af_array in, const af_array we
    \param[out] out will contain the standard deviation of the input array along dimension \p dim
    \param[in] in is the input array
    \param[in] dim the dimension along which the standard deviation is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_stdev
@@ -277,7 +277,7 @@ AFAPI af_err af_stdev(af_array *out, const af_array in, const dim_t dim);
    \param[in] X is the first input array
    \param[in] Y is the second input array
    \param[in] isbiased is boolean specifying if biased estimate should be taken (default: false)
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_cov
@@ -290,7 +290,7 @@ AFAPI af_err af_cov(af_array* out, const af_array X, const af_array Y, const boo
    \param[out] out will contain the median of the input array along dimension \p dim
    \param[in] in is the input array
    \param[in] dim the dimension along which the median is extracted
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_median
@@ -303,7 +303,7 @@ AFAPI af_err af_median(af_array* out, const af_array in, const dim_t dim);
    \param[out] real will contain the real part of mean of the entire input array
    \param[out] imag will contain the imaginary part of mean of the entire input array
    \param[in] in is the input array
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_mean
@@ -317,7 +317,7 @@ AFAPI af_err af_mean_all(double *real, double *imag, const af_array in);
    \param[out] imag will contain the imaginary part of mean of the entire weighted input array
    \param[in] in is the input array
    \param[in] weights  is used to scale input \p in before getting mean
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_mean
@@ -332,7 +332,7 @@ AFAPI af_err af_mean_all_weighted(double *real, double *imag, const af_array in,
    \param[out] imagVal will contain the imaginary part of variance of the entire input array
    \param[in] in is the input array
    \param[in] isbiased is boolean denoting Population variance (false) or Sample Variance (true)
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_var
@@ -346,7 +346,7 @@ AFAPI af_err af_var_all(double *realVal, double *imagVal, const af_array in, con
    \param[out] imagVal will contain the imaginary part of variance of the entire weighted input array
    \param[in] in is the input array
    \param[in] weights  is used to scale input \p in before getting variance
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_var
@@ -359,7 +359,7 @@ AFAPI af_err af_var_all_weighted(double *realVal, double *imagVal, const af_arra
    \param[out] real will contain the real part of standard deviation of the entire input array
    \param[out] imag will contain the imaginary part of standard deviation of the entire input array
    \param[in] in is the input array
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_stdev
@@ -372,7 +372,7 @@ AFAPI af_err af_stdev_all(double *real, double *imag, const af_array in);
    \param[out] realVal will contain the real part of median of the entire input array
    \param[out] imagVal will contain the imaginary part of median of the entire input array
    \param[in] in is the input array
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \ingroup stat_func_median
@@ -386,7 +386,7 @@ AFAPI af_err af_median_all(double *realVal, double *imagVal, const af_array in);
    \param[out] imagVal will contain the imaginary part of correlation coefficient of the inputs
    \param[in] X is the first input array
    \param[in] Y is the second input array
-   \return     \ref AF_SUCCESS if the color transformation is successful,
+   \return     \ref AF_SUCCESS if the operation is successful,
    otherwise an appropriate error code is returned.
 
    \note There are many ways correlation coefficient is calculated. This algorithm returns Pearson product-moment correlation coefficient.
diff --git a/include/af/traits.hpp b/include/af/traits.hpp
index 5f7fed381c..29a1a58ea4 100644
--- a/include/af/traits.hpp
+++ b/include/af/traits.hpp
@@ -139,6 +139,30 @@ struct dtype_traits<unsigned long long> {
     static const char* getName() { return "ulong"; }
 };
 
+#if AF_API_VERSION >= 32
+template<>
+struct dtype_traits<short> {
+    enum {
+        af_type = s16 ,
+        ctype = s16
+    };
+    typedef short base_type;
+    static const char* getName() { return "short"; }
+};
+#endif
+
+#if AF_API_VERSION >= 32
+template<>
+struct dtype_traits<unsigned short> {
+    enum {
+        af_type = u16 ,
+        ctype = u16
+    };
+    typedef unsigned short base_type;
+    static const char* getName() { return "ushort"; }
+};
+#endif
+
 }
 
 #endif
diff --git a/include/af/util.h b/include/af/util.h
index 97e939e3e6..c1fd96ab24 100644
--- a/include/af/util.h
+++ b/include/af/util.h
@@ -121,11 +121,11 @@ namespace af
 
 #define af_print(...) GET_PRINT_MACRO(__VA_ARGS__, AF_PRINT2, AF_PRINT1)(__VA_ARGS__)
 
-#else
+#else // AF_API_VERSION
 
 #define af_print(exp) af::print(#exp, exp);
 
-#endif
+#endif // AF_API_VERSION
 
 #endif //__cplusplus
 
diff --git a/include/af/vision.h b/include/af/vision.h
index 470a12d980..78cc107ac5 100644
--- a/include/af/vision.h
+++ b/include/af/vision.h
@@ -39,7 +39,9 @@ class array;
 
     \ingroup cv_func_fast
  */
-AFAPI features fast(const array& in, const float thr=20.0f, const unsigned arc_length=9, const bool non_max=true, const float feature_ratio=0.05, const unsigned edge=3);
+AFAPI features fast(const array& in, const float thr=20.0f, const unsigned arc_length=9,
+                    const bool non_max=true, const float feature_ratio=0.05,
+                    const unsigned edge=3);
 
 #if AF_API_VERSION >= 31
 /**
@@ -68,7 +70,9 @@ AFAPI features fast(const array& in, const float thr=20.0f, const unsigned arc_l
 
     \ingroup cv_func_harris
  */
-AFAPI features harris(const array& in, const unsigned max_corners=500, const float min_response=1e5f, const float sigma=1.f, const unsigned block_size=0, const float k_thr=0.04f);
+AFAPI features harris(const array& in, const unsigned max_corners=500,
+                      const float min_response=1e5f, const float sigma=1.f,
+                      const unsigned block_size=0, const float k_thr=0.04f);
 #endif
 
 /**
@@ -93,7 +97,10 @@ AFAPI features harris(const array& in, const unsigned max_corners=500, const flo
 
     \ingroup cv_func_orb
  */
-AFAPI void orb(features& feat, array& desc, const array& image, const float fast_thr=20.f, const unsigned max_feat=400, const float scl_fctr=1.5f, const unsigned levels=4, const bool blur_img=false);
+AFAPI void orb(features& feat, array& desc, const array& image,
+               const float fast_thr=20.f, const unsigned max_feat=400,
+               const float scl_fctr=1.5f, const unsigned levels=4,
+               const bool blur_img=false);
 
 #if AF_API_VERSION >= 31
 /**
@@ -127,7 +134,48 @@ AFAPI void orb(features& feat, array& desc, const array& image, const float fast
 
     \ingroup cv_func_sift
  */
-AFAPI void sift(features& feat, array& desc, const array& in, const unsigned n_layers=3, const float contrast_thr=0.04f, const float edge_thr=10.f, const float init_sigma=1.6f, const bool double_input=true, const float intensity_scale=0.00390625f, const float feature_ratio=0.05f);
+AFAPI void sift(features& feat, array& desc, const array& in, const unsigned n_layers=3,
+                const float contrast_thr=0.04f, const float edge_thr=10.f,
+                const float init_sigma=1.6f, const bool double_input=true,
+                const float intensity_scale=0.00390625f, const float feature_ratio=0.05f);
+#endif
+
+#if AF_API_VERSION >= 32
+/**
+    C++ Interface for SIFT feature detector and GLOH descriptor
+
+    \param[out] feat features object composed of arrays for x and y
+                coordinates, score, orientation and size of selected features
+    \param[out] desc Nx272 array containing extracted GLOH descriptors, where N
+                is the number of features found by SIFT
+    \param[in]  in array containing a grayscale image (color images are not
+                supported)
+    \param[in]  n_layers number of layers per octave, the number of octaves is
+                computed automatically according to the input image dimensions,
+                the original SIFT paper suggests 3
+    \param[in]  contrast_thr threshold used to filter out features that have
+                low contrast, the original SIFT paper suggests 0.04
+    \param[in]  edge_thr threshold used to filter out features that are too
+                edge-like, the original SIFT paper suggests 10.0
+    \param[in]  init_sigma the sigma value used to filter the input image at
+                the first octave, the original SIFT paper suggests 1.6
+    \param[in]  double_input if true, the input image dimensions will be
+                doubled and the doubled image will be used for the first octave
+    \param[in]  intensity_scale the inverse of the difference between the minimum
+                and maximum grayscale intensity value, e.g.: if the ranges are
+                0-256, the proper intensity_scale value is 1/256, if the ranges
+                are 0-1, the proper intensity-scale value is 1/1
+    \param[in]  feature_ratio maximum ratio of features to detect, the maximum
+                number of features is calculated by feature_ratio * in.elements().
+                The maximum number of features is not based on the score, instead,
+                features detected after the limit is reached are discarded
+
+    \ingroup cv_func_sift
+ */
+AFAPI void gloh(features& feat, array& desc, const array& in, const unsigned n_layers=3,
+                const float contrast_thr=0.04f, const float edge_thr=10.f,
+                const float init_sigma=1.6f, const bool double_input=true,
+                const float intensity_scale=0.00390625f, const float feature_ratio=0.05f);
 #endif
 
 /**
@@ -245,6 +293,37 @@ AFAPI features susan(const array& in,
 AFAPI array dog(const array& in, const int radius1, const int radius2);
 #endif
 
+#if AF_API_VERSION >= 32
+/**
+   C++ Interface for Homography estimation
+
+   \param[out] H is a 3x3 array containing the estimated homography.
+   \param[out] inliers is the number of inliers that the homography was estimated to comprise,
+               in the case that htype is AF_HOMOGRAPHY_RANSAC, a higher inlier_thr value will increase the
+               estimated inliers. Note that if the number of inliers is too low, it is likely
+               that a bad homography will be returned.
+   \param[in]  x_src x coordinates of the source points.
+   \param[in]  y_src y coordinates of the source points.
+   \param[in]  x_dst x coordinates of the destination points.
+   \param[in]  y_dst y coordinates of the destination points.
+   \param[in]  htype can be AF_HOMOGRAPHY_RANSAC, for which a RANdom SAmple Consensus will be
+               used to evaluate the homography quality (e.g., number of inliers), or AF_HOMOGRAPHY_LMEDS,
+               which will use Least Median of Squares method to evaluate homography quality
+   \param[in]  inlier_thr if htype is AF_HOMOGRAPHY_RANSAC, this parameter will five the maximum L2-distance
+               for a point to be considered an inlier.
+   \param[in]  iterations maximum number of iterations when htype is AF_HOMOGRAPHY_RANSAC and backend is CPU,
+               if backend is CUDA or OpenCL, iterations is the total number of iterations, an
+               iteration is a selection of 4 random points for which the homography is estimated
+               and evaluated for number of inliers.
+   \param[in]  otype the array type for the homography output.
+
+   \ingroup cv_func_homography
+*/
+AFAPI void homography(array& H, int& inliers, const array& x_src, const array& y_src,
+                      const array& x_dst, const array& y_dst, const af_homography_type htype=AF_HOMOGRAPHY_RANSAC,
+                      const float inlier_thr=3.f, const unsigned iterations=1000, const dtype otype=f32);
+#endif
+
 }
 #endif
 
@@ -277,7 +356,8 @@ extern "C" {
 
         \ingroup cv_func_fast
     */
-    AFAPI af_err af_fast(af_features *out, const af_array in, const float thr, const unsigned arc_length, const bool non_max, const float feature_ratio, const unsigned edge);
+    AFAPI af_err af_fast(af_features *out, const af_array in, const float thr, const unsigned arc_length,
+                         const bool non_max, const float feature_ratio, const unsigned edge);
 
 #if AF_API_VERSION >= 31
     /**
@@ -306,7 +386,9 @@ extern "C" {
 
         \ingroup cv_func_harris
     */
-    AFAPI af_err af_harris(af_features *out, const af_array in, const unsigned max_corners, const float min_response, const float sigma, const unsigned block_size, const float k_thr);
+    AFAPI af_err af_harris(af_features *out, const af_array in, const unsigned max_corners,
+                           const float min_response, const float sigma,
+                           const unsigned block_size, const float k_thr);
 #endif
 
     /**
@@ -331,7 +413,9 @@ extern "C" {
 
         \ingroup cv_func_orb
     */
-    AFAPI af_err af_orb(af_features *feat, af_array *desc, const af_array in, const float fast_thr, const unsigned max_feat, const float scl_fctr, const unsigned levels, const bool blur_img);
+    AFAPI af_err af_orb(af_features *feat, af_array *desc, const af_array in,
+                        const float fast_thr, const unsigned max_feat, const float scl_fctr,
+                        const unsigned levels, const bool blur_img);
 
 #if AF_API_VERSION >= 31
     /**
@@ -365,7 +449,48 @@ extern "C" {
 
         \ingroup cv_func_sift
     */
-    AFAPI af_err af_sift(af_features *feat, af_array *desc, const af_array in, const unsigned n_layers, const float contrast_thr, const float edge_thr, const float init_sigma, const bool double_input, const float intensity_scale, const float feature_ratio);
+    AFAPI af_err af_sift(af_features *feat, af_array *desc, const af_array in,
+                         const unsigned n_layers, const float contrast_thr, const float edge_thr,
+                         const float init_sigma, const bool double_input,
+                         const float intensity_scale, const float feature_ratio);
+#endif
+
+#if AF_API_VERSION >= 32
+    /**
+        C++ Interface for SIFT feature detector and GLOH descriptor
+
+        \param[out] feat af_features object composed of arrays for x and y
+                    coordinates, score, orientation and size of selected features
+        \param[out] desc Nx272 array containing extracted GLOH descriptors, where N
+                    is the number of features found by SIFT
+        \param[in]  in array containing a grayscale image (color images are not
+                    supported)
+        \param[in]  n_layers number of layers per octave, the number of octaves is
+                    computed automatically according to the input image dimensions,
+                    the original SIFT paper suggests 3
+        \param[in]  contrast_thr threshold used to filter out features that have
+                    low contrast, the original SIFT paper suggests 0.04
+        \param[in]  edge_thr threshold used to filter out features that are too
+                    edge-like, the original SIFT paper suggests 10.0
+        \param[in]  init_sigma the sigma value used to filter the input image at
+                    the first octave, the original SIFT paper suggests 1.6
+        \param[in]  double_input if true, the input image dimensions will be
+                    doubled and the doubled image will be used for the first octave
+        \param[in]  intensity_scale the inverse of the difference between the minimum
+                    and maximum grayscale intensity value, e.g.: if the ranges are
+                    0-256, the proper intensity_scale value is 1/256, if the ranges
+                    are 0-1, the proper intensity-scale value is 1/1
+        \param[in]  feature_ratio maximum ratio of features to detect, the maximum
+                    number of features is calculated by feature_ratio * in.elements().
+                    The maximum number of features is not based on the score, instead,
+                    features detected after the limit is reached are discarded
+
+        \ingroup cv_func_sift
+    */
+    AFAPI af_err af_gloh(af_features *feat, af_array *desc, const af_array in,
+                         const unsigned n_layers, const float contrast_thr,
+                         const float edge_thr, const float init_sigma, const bool double_input,
+                         const float intensity_scale, const float feature_ratio);
 #endif
 
     /**
@@ -441,7 +566,8 @@ extern "C" {
 
        \ingroup cv_func_match_template
     */
-    AFAPI af_err af_match_template(af_array *out, const af_array search_img, const af_array template_img, const af_match_type m_type);
+    AFAPI af_err af_match_template(af_array *out, const af_array search_img,
+                                   const af_array template_img, const af_match_type m_type);
 
 #if AF_API_VERSION >= 31
     /**
@@ -462,7 +588,8 @@ extern "C" {
 
        \ingroup cv_func_susan
     */
-    AFAPI af_err af_susan(af_features* out, const af_array in, const unsigned radius, const float diff_thr, const float geom_thr,
+    AFAPI af_err af_susan(af_features* out, const af_array in, const unsigned radius,
+                          const float diff_thr, const float geom_thr,
                           const float feature_ratio, const unsigned edge);
 #endif
 
@@ -482,6 +609,40 @@ extern "C" {
     AFAPI af_err af_dog(af_array *out, const af_array in, const int radius1, const int radius2);
 #endif
 
+#if AF_API_VERSION >= 32
+    /**
+       C Interface wrapper for Homography estimation
+
+       \param[out] H is a 3x3 array containing the estimated homography.
+       \param[out] inliers is the number of inliers that the homography was estimated to comprise,
+                   in the case that htype is AF_HOMOGRAPHY_RANSAC, a higher inlier_thr value will increase the
+                   estimated inliers. Note that if the number of inliers is too low, it is likely
+                   that a bad homography will be returned.
+       \param[in]  x_src x coordinates of the source points.
+       \param[in]  y_src y coordinates of the source points.
+       \param[in]  x_dst x coordinates of the destination points.
+       \param[in]  y_dst y coordinates of the destination points.
+       \param[in]  htype can be AF_HOMOGRAPHY_RANSAC, for which a RANdom SAmple Consensus will be
+                   used to evaluate the homography quality (e.g., number of inliers), or AF_HOMOGRAPHY_LMEDS,
+                   which will use Least Median of Squares method to evaluate homography quality.
+       \param[in]  inlier_thr if htype is AF_HOMOGRAPHY_RANSAC, this parameter will five the maximum L2-distance
+                   for a point to be considered an inlier.
+       \param[in]  iterations maximum number of iterations when htype is AF_HOMOGRAPHY_RANSAC and backend is CPU,
+                   if backend is CUDA or OpenCL, iterations is the total number of iterations, an
+                   iteration is a selection of 4 random points for which the homography is estimated
+                   and evaluated for number of inliers.
+       \param[in]  otype the array type for the homography output.
+       \return     \ref AF_SUCCESS if the computation is is successful,
+                   otherwise an appropriate error code is returned.
+
+       \ingroup cv_func_homography
+     */
+    AFAPI af_err af_homography(af_array *H, int *inliers, const af_array x_src, const af_array y_src,
+                               const af_array x_dst, const af_array y_dst,
+                               const af_homography_type htype, const float inlier_thr,
+                               const unsigned iterations, const af_dtype otype);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/arrayfire.h b/include/arrayfire.h
index 56518b2c86..e4ac1bbb71 100644
--- a/include/arrayfire.h
+++ b/include/arrayfire.h
@@ -200,6 +200,13 @@
      Reading and writing images
    @}
 
+   @defgroup unified_func Unified API Functions
+   @{
+
+     Functions to set current backend and utilities
+
+   @}
+
    @defgroup external Interface Functions
    @{
 
@@ -262,12 +269,16 @@
 \example histogram.cpp
 \example fractal.cpp
 \example plot2d.cpp
+\example plot3.cpp
+\example surface.cpp
 \example conway_pretty.cpp
+\example basic.cpp
 \example helloworld.cpp
 \example vectorize.cpp
 \example integer.cpp
 \example convolve.cpp
 \example rainfall.cpp
+\example swe.cpp
 \example morphing.cpp
 \example image_demo.cpp
 \example brain_segmentation.cpp
@@ -284,6 +295,7 @@
 #include "af/algorithm.h"
 #include "af/arith.h"
 #include "af/array.h"
+#include "af/backend.h"
 #include "af/blas.h"
 #include "af/constants.h"
 #include "af/complex.h"
diff --git a/src/api/c/approx.cpp b/src/api/c/approx.cpp
index 1bc7723fdf..7c2935ac1b 100644
--- a/src/api/c/approx.cpp
+++ b/src/api/c/approx.cpp
@@ -41,13 +41,18 @@ af_err af_approx1(af_array *out, const af_array in, const af_array pos,
         ArrayInfo i_info = getInfo(in);
         ArrayInfo p_info = getInfo(pos);
 
+        dim4 idims = i_info.dims();
+        dim4 pdims = p_info.dims();
+
         af_dtype itype = i_info.getType();
 
         ARG_ASSERT(1, i_info.isFloating());                       // Only floating and complex types
         ARG_ASSERT(2, p_info.isRealFloating());                   // Only floating types
         ARG_ASSERT(1, i_info.isSingle() == p_info.isSingle());    // Must have same precision
         ARG_ASSERT(1, i_info.isDouble() == p_info.isDouble());    // Must have same precision
-        DIM_ASSERT(2, p_info.isColumn());                         // Only 1D input allowed
+        // POS should either be (x, 1, 1, 1) or (1, idims[1], idims[2], idims[3])
+        DIM_ASSERT(2, p_info.isColumn() ||
+                      (pdims[1] == idims[1] && pdims[2] == idims[2] && pdims[3] == idims[3]));
         ARG_ASSERT(3, (method == AF_INTERP_LINEAR || method == AF_INTERP_NEAREST));
 
         af_array output;
@@ -74,16 +79,23 @@ af_err af_approx2(af_array *out, const af_array in, const af_array pos0, const a
         ArrayInfo p_info = getInfo(pos0);
         ArrayInfo q_info = getInfo(pos1);
 
+        dim4 idims = i_info.dims();
+        dim4 pdims = p_info.dims();
+        dim4 qdims = q_info.dims();
+
         af_dtype itype = i_info.getType();
 
-        ARG_ASSERT(1, i_info.isFloating());                       // Only floating and complex types
-        ARG_ASSERT(2, p_info.isRealFloating());                   // Only floating types
-        ARG_ASSERT(3, q_info.isRealFloating());                   // Only floating types
-        ARG_ASSERT(1, p_info.getType() == q_info.getType());      // Must have same type
-        ARG_ASSERT(1, i_info.isSingle() == p_info.isSingle());    // Must have same precision
-        ARG_ASSERT(1, i_info.isDouble() == p_info.isDouble());    // Must have same precision
-        DIM_ASSERT(2, p_info.dims() == q_info.dims());            // POS0 and POS1 must have same dims
-        DIM_ASSERT(2, p_info.ndims() < 3);// Allowing input batch but not positions. Output dims = (px, py, iz, iw)
+        ARG_ASSERT(1, i_info.isFloating());                     // Only floating and complex types
+        ARG_ASSERT(2, p_info.isRealFloating());                 // Only floating types
+        ARG_ASSERT(3, q_info.isRealFloating());                 // Only floating types
+        ARG_ASSERT(1, p_info.getType() == q_info.getType());    // Must have same type
+        ARG_ASSERT(1, i_info.isSingle() == p_info.isSingle());  // Must have same precision
+        ARG_ASSERT(1, i_info.isDouble() == p_info.isDouble());  // Must have same precision
+        DIM_ASSERT(2, pdims == qdims);                          // POS0 and POS1 must have same dims
+
+        // POS should either be (x, y, 1, 1) or (x, y, idims[2], idims[3])
+        DIM_ASSERT(2, (pdims[2] == 1        && pdims[3] == 1) ||
+                      (pdims[2] == idims[2] && pdims[3] == idims[3]));
         ARG_ASSERT(3, (method == AF_INTERP_LINEAR || method == AF_INTERP_NEAREST));
 
         af_array output;
diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp
index c990889666..13fa179da8 100644
--- a/src/api/c/assign.cpp
+++ b/src/api/c/assign.cpp
@@ -105,6 +105,8 @@ void assign_helper(Array<T> &out, const unsigned &ndims, const af_seq *index, co
             case u32: assign<T, uint   >(out, ndims, index, getArray<uint     >(in_));  break;
             case s64: assign<T, intl   >(out, ndims, index, getArray<intl     >(in_));  break;
             case u64: assign<T, uintl  >(out, ndims, index, getArray<uintl    >(in_));  break;
+            case s16: assign<T, short  >(out, ndims, index, getArray<short    >(in_));  break;
+            case u16: assign<T, ushort >(out, ndims, index, getArray<ushort   >(in_));  break;
             case u8 : assign<T, uchar  >(out, ndims, index, getArray<uchar    >(in_));  break;
             case b8 : assign<T, char   >(out, ndims, index, getArray<char     >(in_));  break;
             default : TYPE_ERROR(1, iType); break;
@@ -165,6 +167,8 @@ af_err af_assign_seq(af_array *out,
                 case u32: assign_helper<uint   >(getWritableArray<uint   >(res), ndims, index, rhs);  break;
                 case s64: assign_helper<intl   >(getWritableArray<intl   >(res), ndims, index, rhs);  break;
                 case u64: assign_helper<uintl  >(getWritableArray<uintl  >(res), ndims, index, rhs);  break;
+                case s16: assign_helper<short  >(getWritableArray<short  >(res), ndims, index, rhs);  break;
+                case u16: assign_helper<ushort >(getWritableArray<ushort >(res), ndims, index, rhs);  break;
                 case u8 : assign_helper<uchar  >(getWritableArray<uchar  >(res), ndims, index, rhs);  break;
                 case b8 : assign_helper<char   >(getWritableArray<char   >(res), ndims, index, rhs);  break;
                 default : TYPE_ERROR(1, oType); break;
@@ -332,6 +336,8 @@ af_err af_assign_gen(af_array *out,
                 case u32: genAssign<uint   >(output, idxrs, rhs); break;
                 case s64: genAssign<intl   >(output, idxrs, rhs); break;
                 case s32: genAssign<int    >(output, idxrs, rhs); break;
+                case s16: genAssign<short  >(output, idxrs, rhs); break;
+                case u16: genAssign<ushort >(output, idxrs, rhs); break;
                 case  u8: genAssign<uchar  >(output, idxrs, rhs); break;
                 case  b8: genAssign<char   >(output, idxrs, rhs); break;
                 default: TYPE_ERROR(1, rhsType);
diff --git a/src/api/c/bilateral.cpp b/src/api/c/bilateral.cpp
index c83c7ef8db..4f9281d782 100644
--- a/src/api/c/bilateral.cpp
+++ b/src/api/c/bilateral.cpp
@@ -42,6 +42,8 @@ static af_err bilateral(af_array *out, const af_array &in, const float &s_sigma,
             case s32: output = bilateral<int   ,  float, isColor> (in, s_sigma, c_sigma); break;
             case u32: output = bilateral<uint  ,  float, isColor> (in, s_sigma, c_sigma); break;
             case u8 : output = bilateral<uchar ,  float, isColor> (in, s_sigma, c_sigma); break;
+            case s16: output = bilateral<short ,  float, isColor> (in, s_sigma, c_sigma); break;
+            case u16: output = bilateral<ushort,  float, isColor> (in, s_sigma, c_sigma); break;
             default : TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
diff --git a/src/api/c/binary.cpp b/src/api/c/binary.cpp
index 8a6ae465a6..2997c13692 100644
--- a/src/api/c/binary.cpp
+++ b/src/api/c/binary.cpp
@@ -55,6 +55,8 @@ static af_err af_arith(af_array *out, const af_array lhs, const af_array rhs, co
         case b8 : res = arithOp<char   , op>(lhs, rhs, odims); break;
         case s64: res = arithOp<intl   , op>(lhs, rhs, odims); break;
         case u64: res = arithOp<uintl  , op>(lhs, rhs, odims); break;
+        case s16: res = arithOp<short  , op>(lhs, rhs, odims); break;
+        case u16: res = arithOp<ushort , op>(lhs, rhs, odims); break;
         default: TYPE_ERROR(0, otype);
         }
 
@@ -85,6 +87,8 @@ static af_err af_arith_real(af_array *out, const af_array lhs, const af_array rh
         case b8 : res = arithOp<char   , op>(lhs, rhs, odims); break;
         case s64: res = arithOp<intl   , op>(lhs, rhs, odims); break;
         case u64: res = arithOp<uintl  , op>(lhs, rhs, odims); break;
+        case s16: res = arithOp<short  , op>(lhs, rhs, odims); break;
+        case u16: res = arithOp<ushort , op>(lhs, rhs, odims); break;
         default: TYPE_ERROR(0, otype);
         }
 
@@ -260,6 +264,8 @@ static af_err af_logic(af_array *out, const af_array lhs, const af_array rhs, co
         case b8 : res = logicOp<char   , op>(lhs, rhs, odims); break;
         case s64: res = logicOp<intl   , op>(lhs, rhs, odims); break;
         case u64: res = logicOp<uintl  , op>(lhs, rhs, odims); break;
+        case s16: res = logicOp<short  , op>(lhs, rhs, odims); break;
+        case u16: res = logicOp<ushort , op>(lhs, rhs, odims); break;
         default: TYPE_ERROR(0, type);
         }
 
@@ -335,6 +341,8 @@ static af_err af_bitwise(af_array *out, const af_array lhs, const af_array rhs,
         case b8 : res = bitOp<char   , op>(lhs, rhs, odims); break;
         case s64: res = bitOp<intl   , op>(lhs, rhs, odims); break;
         case u64: res = bitOp<uintl  , op>(lhs, rhs, odims); break;
+        case s16: res = bitOp<short  , op>(lhs, rhs, odims); break;
+        case u16: res = bitOp<ushort , op>(lhs, rhs, odims); break;
         default: TYPE_ERROR(0, type);
         }
 
diff --git a/src/api/c/cast.cpp b/src/api/c/cast.cpp
index 379b2df91b..872ace27c5 100644
--- a/src/api/c/cast.cpp
+++ b/src/api/c/cast.cpp
@@ -39,6 +39,8 @@ static af_array cast(const af_array in, const af_dtype type)
     case b8 : return getHandle(castArray<char    >(in));
     case s64: return getHandle(castArray<intl    >(in));
     case u64: return getHandle(castArray<uintl   >(in));
+    case s16: return getHandle(castArray<short   >(in));
+    case u16: return getHandle(castArray<ushort  >(in));
     default: TYPE_ERROR(2, type);
     }
 }
diff --git a/src/api/c/convolve.cpp b/src/api/c/convolve.cpp
index 912d8fd0a0..750552db88 100644
--- a/src/api/c/convolve.cpp
+++ b/src/api/c/convolve.cpp
@@ -85,6 +85,10 @@ af_err convolve(af_array *out, const af_array signal, const af_array filter)
             case f64: output = convolve<double ,  double, baseDim, expand>(signal, filter, convBT); break;
             case u32: output = convolve<uint   ,   float, baseDim, expand>(signal, filter, convBT); break;
             case s32: output = convolve<int    ,   float, baseDim, expand>(signal, filter, convBT); break;
+            case u16: output = convolve<ushort ,   float, baseDim, expand>(signal, filter, convBT); break;
+            case s16: output = convolve<short  ,   float, baseDim, expand>(signal, filter, convBT); break;
+            case u64: output = convolve<uintl  ,   float, baseDim, expand>(signal, filter, convBT); break;
+            case s64: output = convolve<intl   ,   float, baseDim, expand>(signal, filter, convBT); break;
             case u8:  output = convolve<uchar  ,   float, baseDim, expand>(signal, filter, convBT); break;
             case b8:  output = convolve<char   ,   float, baseDim, expand>(signal, filter, convBT); break;
             default: TYPE_ERROR(1, stype);
@@ -120,6 +124,10 @@ af_err convolve2_sep(af_array *out, af_array col_filter, af_array row_filter, co
             case f64: output = convolve2<double ,  double, expand>(signal, col_filter, row_filter); break;
             case u32: output = convolve2<uint   ,   float, expand>(signal, col_filter, row_filter); break;
             case s32: output = convolve2<int    ,   float, expand>(signal, col_filter, row_filter); break;
+            case u16: output = convolve2<ushort ,   float, expand>(signal, col_filter, row_filter); break;
+            case s16: output = convolve2<short  ,   float, expand>(signal, col_filter, row_filter); break;
+            case u64: output = convolve2<uintl  ,   float, expand>(signal, col_filter, row_filter); break;
+            case s64: output = convolve2<intl   ,   float, expand>(signal, col_filter, row_filter); break;
             case u8:  output = convolve2<uchar  ,   float, expand>(signal, col_filter, row_filter); break;
             case b8:  output = convolve2<char   ,   float, expand>(signal, col_filter, row_filter); break;
             default: TYPE_ERROR(1, signalType);
diff --git a/src/api/c/corrcoef.cpp b/src/api/c/corrcoef.cpp
index d6d98006a9..275fa80239 100644
--- a/src/api/c/corrcoef.cpp
+++ b/src/api/c/corrcoef.cpp
@@ -71,6 +71,8 @@ af_err af_corrcoef(double *realVal, double *imagVal, const af_array X, const af_
             case u32: *realVal = corrcoef<uint  , float >(X, Y); break;
             case s64: *realVal = corrcoef<intl  , double>(X, Y); break;
             case u64: *realVal = corrcoef<uintl , double>(X, Y); break;
+            case s16: *realVal = corrcoef<short , float >(X, Y); break;
+            case u16: *realVal = corrcoef<ushort, float >(X, Y); break;
             case  u8: *realVal = corrcoef<uchar , float >(X, Y); break;
             case  b8: *realVal = corrcoef<char  , float >(X, Y); break;
             default : TYPE_ERROR(1, xType);
diff --git a/src/api/c/covariance.cpp b/src/api/c/covariance.cpp
index 80b391d1a7..f8bb9c4435 100644
--- a/src/api/c/covariance.cpp
+++ b/src/api/c/covariance.cpp
@@ -27,14 +27,16 @@ using namespace detail;
 template<typename T, typename cType>
 static af_array cov(const af_array& X, const af_array& Y, const bool isbiased)
 {
-    Array<cType> xArr = cast<cType>(getArray<T>(X));
-    Array<cType> yArr = cast<cType>(getArray<T>(Y));
+    Array<T> _x = getArray<T>(X);
+    Array<T> _y = getArray<T>(Y);
+    Array<cType> xArr = cast<cType>(_x);
+    Array<cType> yArr = cast<cType>(_y);
 
     dim4 xDims = xArr.dims();
     dim_t N = isbiased ? xDims[0] : xDims[0]-1;
 
-    Array<cType> xmArr = createValueArray<cType>(xDims, mean<cType>(xArr));
-    Array<cType> ymArr = createValueArray<cType>(xDims, mean<cType>(yArr));
+    Array<cType> xmArr = createValueArray<cType>(xDims, mean<T, cType>(_x));
+    Array<cType> ymArr = createValueArray<cType>(xDims, mean<T, cType>(_y));
     Array<cType> nArr  = createValueArray<cType>(xDims, scalar<cType>(N));
 
     Array<cType> diffX = detail::arithOp<cType, af_sub_t>(xArr, xmArr, xDims);
@@ -71,6 +73,8 @@ af_err af_cov(af_array* out, const af_array X, const af_array Y, const bool isbi
             case u32: output = cov<uint  , float >(X, Y, isbiased); break;
             case s64: output = cov<intl  , double>(X, Y, isbiased); break;
             case u64: output = cov<uintl , double>(X, Y, isbiased); break;
+            case s16: output = cov<short , float >(X, Y, isbiased); break;
+            case u16: output = cov<ushort, float >(X, Y, isbiased); break;
             case  u8: output = cov<uchar , float >(X, Y, isbiased); break;
             default : TYPE_ERROR(1, xType);
         }
diff --git a/src/api/c/data.cpp b/src/api/c/data.cpp
index 50acaad1d3..4d77fb279e 100644
--- a/src/api/c/data.cpp
+++ b/src/api/c/data.cpp
@@ -59,6 +59,8 @@ af_err af_get_data_ptr(void *data, const af_array arr)
         case u8:    copyData(static_cast<uchar    *>(data), arr);  break;
         case s64:   copyData(static_cast<intl     *>(data), arr);  break;
         case u64:   copyData(static_cast<uintl    *>(data), arr);  break;
+        case s16:   copyData(static_cast<short    *>(data), arr);  break;
+        case u16:   copyData(static_cast<ushort   *>(data), arr);  break;
         default:    TYPE_ERROR(1, type);
         }
     }
@@ -88,6 +90,8 @@ af_err af_create_array(af_array *result, const void * const data,
         case u8:    out = createHandleFromData(d, static_cast<const uchar   *>(data)); break;
         case s64:   out = createHandleFromData(d, static_cast<const intl    *>(data)); break;
         case u64:   out = createHandleFromData(d, static_cast<const uintl   *>(data)); break;
+        case s16:   out = createHandleFromData(d, static_cast<const short   *>(data)); break;
+        case u16:   out = createHandleFromData(d, static_cast<const ushort  *>(data)); break;
         default:    TYPE_ERROR(4, type);
         }
         std::swap(*result, out);
@@ -118,6 +122,8 @@ af_err af_constant(af_array *result, const double value,
         case u8:    out = createHandleFromValue<uchar  >(d, value); break;
         case s64:   out = createHandleFromValue<intl   >(d, value); break;
         case u64:   out = createHandleFromValue<uintl  >(d, value); break;
+        case s16:   out = createHandleFromValue<short  >(d, value); break;
+        case u16:   out = createHandleFromValue<ushort >(d, value); break;
         default:    TYPE_ERROR(4, type);
         }
         std::swap(*result, out);
@@ -212,6 +218,8 @@ af_err af_create_handle(af_array *result, const unsigned ndims, const dim_t * co
         case u8:    out = createHandle<uchar  >(d); break;
         case s64:   out = createHandle<intl   >(d); break;
         case u64:   out = createHandle<uintl  >(d); break;
+        case s16:   out = createHandle<short  >(d); break;
+        case u16:   out = createHandle<ushort >(d); break;
         default:    TYPE_ERROR(3, type);
         }
         std::swap(*result, out);
@@ -239,6 +247,8 @@ af_err af_copy_array(af_array *out, const af_array in)
         case u8:    res = copyArray<uchar   >(in); break;
         case s64:   res = copyArray<intl    >(in); break;
         case u64:   res = copyArray<uintl   >(in); break;
+        case s16:   res = copyArray<short   >(in); break;
+        case u16:   res = copyArray<ushort  >(in); break;
         default:    TYPE_ERROR(1, type);
         }
         std::swap(*out, res);
@@ -266,6 +276,8 @@ af_err af_get_data_ref_count(int *use_count, const af_array in)
         case u8:    res = getArray<uchar   >(in).useCount(); break;
         case s64:   res = getArray<intl    >(in).useCount(); break;
         case u64:   res = getArray<uintl   >(in).useCount(); break;
+        case s16:   res = getArray<short   >(in).useCount(); break;
+        case u16:   res = getArray<ushort  >(in).useCount(); break;
         default:    TYPE_ERROR(1, type);
         }
         std::swap(*use_count, res);
@@ -310,6 +322,8 @@ af_err af_randu(af_array *out, const unsigned ndims, const dim_t * const dims, c
         case u32:   result = randu_<uint   >(d);    break;
         case s64:   result = randu_<intl   >(d);    break;
         case u64:   result = randu_<uintl  >(d);    break;
+        case s16:   result = randu_<short  >(d);    break;
+        case u16:   result = randu_<ushort >(d);    break;
         case u8:    result = randu_<uchar  >(d);    break;
         case b8:    result = randu_<char  >(d);    break;
         default:    TYPE_ERROR(3, type);
@@ -375,6 +389,8 @@ af_err af_identity(af_array *out, const unsigned ndims, const dim_t * const dims
         case u8:    result = identity_<uchar  >(d);    break;
         case u64:   result = identity_<uintl  >(d);    break;
         case s64:   result = identity_<intl   >(d);    break;
+        case u16:   result = identity_<ushort >(d);    break;
+        case s16:   result = identity_<short  >(d);    break;
             // Removed because of bool type. Functions implementations exist.
         case b8:    result = identity_<char   >(d);    break;
         default:    TYPE_ERROR(3, type);
@@ -401,6 +417,8 @@ af_err af_release_array(af_array arr)
         case u8:    releaseHandle<uchar   >(arr); break;
         case s64:   releaseHandle<intl    >(arr); break;
         case u64:   releaseHandle<uintl   >(arr); break;
+        case s16:   releaseHandle<short   >(arr); break;
+        case u16:   releaseHandle<ushort  >(arr); break;
         default:    TYPE_ERROR(0, type);
         }
     }
@@ -433,6 +451,8 @@ af_array retain(const af_array in)
     case b8:  return retainHandle<char            >(in);
     case s64: return retainHandle<intl            >(in);
     case u64: return retainHandle<uintl           >(in);
+    case s16: return retainHandle<short           >(in);
+    case u16: return retainHandle<ushort          >(in);
     default:
         TYPE_ERROR(1, ty);
     }
@@ -470,6 +490,8 @@ af_err af_range(af_array *result, const unsigned ndims, const dim_t * const dims
         case u32:   out = range_<uint   >(d, seq_dim); break;
         case s64:   out = range_<intl   >(d, seq_dim); break;
         case u64:   out = range_<uintl  >(d, seq_dim); break;
+        case s16:   out = range_<short  >(d, seq_dim); break;
+        case u16:   out = range_<ushort >(d, seq_dim); break;
         case u8:    out = range_<uchar  >(d, seq_dim); break;
         default:    TYPE_ERROR(4, type);
         }
@@ -495,16 +517,9 @@ af_err af_iota(af_array *result, const unsigned ndims, const dim_t * const dims,
 
         DIM_ASSERT(1, ndims > 0 && ndims <= 4);
         DIM_ASSERT(3, t_ndims > 0 && t_ndims <= 4);
-        dim4 d;
-        dim4 t;
-        for(unsigned i = 0; i < 4; i++) {
-            d[i] = dims[i];
-            DIM_ASSERT(2, d[i] >= 1);
-        }
-        for(unsigned i = 0; i < 4; i++) {
-            t[i] = tdims[i];
-            DIM_ASSERT(4, t[i] >= 1);
-        }
+
+        dim4 d = verifyDims(ndims, dims);
+        dim4 t = verifyDims(t_ndims, tdims);
 
         switch(type) {
         case f32:   out = iota_<float  >(d, t); break;
@@ -513,6 +528,8 @@ af_err af_iota(af_array *result, const unsigned ndims, const dim_t * const dims,
         case u32:   out = iota_<uint   >(d, t); break;
         case s64:   out = iota_<intl   >(d, t); break;
         case u64:   out = iota_<uintl  >(d, t); break;
+        case s16:   out = iota_<short  >(d, t); break;
+        case u16:   out = iota_<ushort >(d, t); break;
         case u8:    out = iota_<uchar  >(d, t); break;
         default:    TYPE_ERROR(4, type);
         }
@@ -596,6 +613,8 @@ af_err af_eval(af_array arr)
         case b8 : eval<char   >(arr); break;
         case s64: eval<intl   >(arr); break;
         case u64: eval<uintl  >(arr); break;
+        case s16: eval<short  >(arr); break;
+        case u16: eval<ushort >(arr); break;
         default:
             TYPE_ERROR(0, type);
         }
@@ -633,6 +652,8 @@ af_err af_diag_create(af_array *out, const af_array in, const int num)
         case u32:   result = diagCreate<uint   >(in, num);    break;
         case s64:   result = diagCreate<intl   >(in, num);    break;
         case u64:   result = diagCreate<uintl  >(in, num);    break;
+        case s16:   result = diagCreate<short  >(in, num);    break;
+        case u16:   result = diagCreate<ushort >(in, num);    break;
         case u8:    result = diagCreate<uchar  >(in, num);    break;
             // Removed because of bool type. Functions implementations exist.
         case b8:    result = diagCreate<char   >(in, num);    break;
@@ -662,6 +683,8 @@ af_err af_diag_extract(af_array *out, const af_array in, const int num)
         case u32:   result = diagExtract<uint   >(in, num);    break;
         case s64:   result = diagExtract<intl   >(in, num);    break;
         case u64:   result = diagExtract<uintl  >(in, num);    break;
+        case s16:   result = diagExtract<short  >(in, num);    break;
+        case u16:   result = diagExtract<ushort >(in, num);    break;
         case u8:    result = diagExtract<uchar  >(in, num);    break;
             // Removed because of bool type. Functions implementations exist.
         case b8:    result = diagExtract<char   >(in, num);    break;
@@ -702,6 +725,8 @@ af_err af_write_array(af_array arr, const void *data, const size_t bytes, af_sou
         case u8:    write_array(arr, static_cast<const uchar   *>(data), bytes, src); break;
         case s64:   write_array(arr, static_cast<const intl    *>(data), bytes, src); break;
         case u64:   write_array(arr, static_cast<const uintl   *>(data), bytes, src); break;
+        case s16:   write_array(arr, static_cast<const short   *>(data), bytes, src); break;
+        case u16:   write_array(arr, static_cast<const ushort  *>(data), bytes, src); break;
         default:    TYPE_ERROR(4, type);
         }
     }
@@ -729,9 +754,11 @@ af_err af_lower(af_array *out, const af_array in, bool is_unit_diag)
         case c32: res = triangle<cfloat  , false>(in, is_unit_diag); break;
         case c64: res = triangle<cdouble , false>(in, is_unit_diag); break;
         case s32: res = triangle<int     , false>(in, is_unit_diag); break;
-        case s64: res = triangle<intl    , false>(in, is_unit_diag); break;
         case u32: res = triangle<uint    , false>(in, is_unit_diag); break;
+        case s64: res = triangle<intl    , false>(in, is_unit_diag); break;
         case u64: res = triangle<uintl   , false>(in, is_unit_diag); break;
+        case s16: res = triangle<short   , false>(in, is_unit_diag); break;
+        case u16: res = triangle<ushort  , false>(in, is_unit_diag); break;
         case u8 : res = triangle<uchar   , false>(in, is_unit_diag); break;
         case b8 : res = triangle<char    , false>(in, is_unit_diag); break;
         }
@@ -753,9 +780,11 @@ af_err af_upper(af_array *out, const af_array in, bool is_unit_diag)
         case c32: res = triangle<cfloat  , true>(in, is_unit_diag); break;
         case c64: res = triangle<cdouble , true>(in, is_unit_diag); break;
         case s32: res = triangle<int     , true>(in, is_unit_diag); break;
-        case s64: res = triangle<intl    , true>(in, is_unit_diag); break;
         case u32: res = triangle<uint    , true>(in, is_unit_diag); break;
+        case s64: res = triangle<intl    , true>(in, is_unit_diag); break;
         case u64: res = triangle<uintl   , true>(in, is_unit_diag); break;
+        case s16: res = triangle<short   , true>(in, is_unit_diag); break;
+        case u16: res = triangle<ushort  , true>(in, is_unit_diag); break;
         case u8 : res = triangle<uchar   , true>(in, is_unit_diag); break;
         case b8 : res = triangle<char    , true>(in, is_unit_diag); break;
         }
diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp
index 23fcdd0ce2..28b4cc2c49 100644
--- a/src/api/c/device.cpp
+++ b/src/api/c/device.cpp
@@ -10,6 +10,7 @@
 #include <af/dim4.hpp>
 #include <af/device.h>
 #include <af/version.h>
+#include <af/backend.h>
 #include <backend.hpp>
 #include <platform.hpp>
 #include <Array.hpp>
@@ -19,6 +20,38 @@
 
 using namespace detail;
 
+af_err af_set_backend(const af_backend bknd)
+{
+    try {
+        ARG_ASSERT(0, bknd==getBackend());
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+af_err af_get_backend_count(unsigned* num_backends)
+{
+    *num_backends = 1;
+    return AF_SUCCESS;
+}
+
+af_err af_get_available_backends(int* result)
+{
+    *result = getBackend();
+    return AF_SUCCESS;
+}
+
+af_err af_get_backend_id(af_backend *result, const af_array in)
+{
+    try {
+        ARG_ASSERT(1, in != 0);
+        ArrayInfo info = getInfo(in);
+        *result = info.getBackendId();
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
 af_err af_init()
 {
     try {
@@ -107,9 +140,12 @@ af_err af_device_array(af_array *arr, const void *data,
         AF_CHECK(af_init());
 
         af_array res;
-        af::dim4 d((size_t)dims[0]);
-        for(unsigned i = 1; i < ndims; i++) {
+
+        DIM_ASSERT(1, ndims >= 1);
+        dim4 d(1, 1, 1, 1);
+        for(unsigned i = 0; i < ndims; i++) {
             d[i] = dims[i];
+            DIM_ASSERT(3, dims[i] >= 1);
         }
 
         switch (type) {
@@ -121,6 +157,8 @@ af_err af_device_array(af_array *arr, const void *data,
         case u32: res = getHandle(createDeviceDataArray<uint   >(d, data)); break;
         case s64: res = getHandle(createDeviceDataArray<intl   >(d, data)); break;
         case u64: res = getHandle(createDeviceDataArray<uintl  >(d, data)); break;
+        case s16: res = getHandle(createDeviceDataArray<short  >(d, data)); break;
+        case u16: res = getHandle(createDeviceDataArray<ushort >(d, data)); break;
         case u8 : res = getHandle(createDeviceDataArray<uchar  >(d, data)); break;
         case b8 : res = getHandle(createDeviceDataArray<char   >(d, data)); break;
         default: TYPE_ERROR(4, type);
@@ -135,10 +173,6 @@ af_err af_device_array(af_array *arr, const void *data,
 af_err af_get_device_ptr(void **data, const af_array arr)
 {
     try {
-
-        // Make sure all kernels and memcopies are done before getting device pointer
-        detail::sync(getActiveDeviceId());
-
         af_dtype type = getInfo(arr).getType();
 
         switch (type) {
@@ -151,6 +185,8 @@ af_err af_get_device_ptr(void **data, const af_array arr)
         case u32: *data = getDevicePtr(getArray<uint   >(arr)); break;
         case s64: *data = getDevicePtr(getArray<intl   >(arr)); break;
         case u64: *data = getDevicePtr(getArray<uintl  >(arr)); break;
+        case s16: *data = getDevicePtr(getArray<short  >(arr)); break;
+        case u16: *data = getDevicePtr(getArray<ushort >(arr)); break;
         case u8 : *data = getDevicePtr(getArray<uchar  >(arr)); break;
         case b8 : *data = getDevicePtr(getArray<char   >(arr)); break;
 
@@ -171,10 +207,6 @@ inline void lockDevicePtr(const af_array arr)
 af_err af_lock_device_ptr(const af_array arr)
 {
     try {
-
-        // Make sure all kernels and memcopies are done before getting device pointer
-        detail::sync(getActiveDeviceId());
-
         af_dtype type = getInfo(arr).getType();
 
         switch (type) {
@@ -186,6 +218,8 @@ af_err af_lock_device_ptr(const af_array arr)
         case u32: lockDevicePtr<uint   >(arr); break;
         case s64: lockDevicePtr<intl   >(arr); break;
         case u64: lockDevicePtr<uintl  >(arr); break;
+        case s16: lockDevicePtr<short  >(arr); break;
+        case u16: lockDevicePtr<ushort >(arr); break;
         case u8 : lockDevicePtr<uchar  >(arr); break;
         case b8 : lockDevicePtr<char   >(arr); break;
         default: TYPE_ERROR(4, type);
@@ -205,10 +239,6 @@ inline void unlockDevicePtr(const af_array arr)
 af_err af_unlock_device_ptr(const af_array arr)
 {
     try {
-
-        // Make sure all kernels and memcopies are done before getting device pointer
-        detail::sync(getActiveDeviceId());
-
         af_dtype type = getInfo(arr).getType();
 
         switch (type) {
@@ -220,6 +250,8 @@ af_err af_unlock_device_ptr(const af_array arr)
         case u32: unlockDevicePtr<uint   >(arr); break;
         case s64: unlockDevicePtr<intl   >(arr); break;
         case u64: unlockDevicePtr<uintl  >(arr); break;
+        case s16: unlockDevicePtr<short  >(arr); break;
+        case u16: unlockDevicePtr<ushort >(arr); break;
         case u8 : unlockDevicePtr<uchar  >(arr); break;
         case b8 : unlockDevicePtr<char   >(arr); break;
         default: TYPE_ERROR(4, type);
diff --git a/src/api/c/diff.cpp b/src/api/c/diff.cpp
index 75ce5d82e4..8bc4d07da5 100644
--- a/src/api/c/diff.cpp
+++ b/src/api/c/diff.cpp
@@ -54,6 +54,8 @@ af_err af_diff1(af_array *out, const af_array in, const int dim)
             case u32: output = diff1<uint   >(in,dim);  break;
             case s64: output = diff1<intl   >(in,dim);  break;
             case u64: output = diff1<uintl  >(in,dim);  break;
+            case s16: output = diff1<short  >(in,dim);  break;
+            case u16: output = diff1<ushort >(in,dim);  break;
             case u8:  output = diff1<uchar  >(in,dim);  break;
             default:  TYPE_ERROR(1, type);
         }
@@ -89,6 +91,8 @@ af_err af_diff2(af_array *out, const af_array in, const int dim)
             case u32: output = diff2<uint   >(in,dim);  break;
             case s64: output = diff2<intl   >(in,dim);  break;
             case u64: output = diff2<uintl  >(in,dim);  break;
+            case s16: output = diff2<short  >(in,dim);  break;
+            case u16: output = diff2<ushort >(in,dim);  break;
             case u8:  output = diff2<uchar  >(in,dim);  break;
             default:  TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/dog.cpp b/src/api/c/dog.cpp
index 3c490cb5eb..3cf793cca5 100644
--- a/src/api/c/dog.cpp
+++ b/src/api/c/dog.cpp
@@ -28,19 +28,19 @@ static af_array dog(const af_array& in, const int radius1, const int radius2)
     AF_CHECK(af_gaussian_kernel(&g1, 2*radius1+1, 2*radius1+1, 0.0, 0.0));
     AF_CHECK(af_gaussian_kernel(&g2, 2*radius2+1, 2*radius2+1, 0.0, 0.0));
 
-    Array<T> input  = getArray<T>(in);
+    Array<accT> input  = castArray<accT>(in);
     dim4 iDims      = input.dims();
 
     ConvolveBatchKind bkind = iDims[2] > 1 ? CONVOLVE_BATCH_SIGNAL : CONVOLVE_BATCH_NONE;
 
-    Array<T> smth1 = convolve<T, accT, 2, false>(input, castArray<accT>(g1), bkind);
-    Array<T> smth2 = convolve<T, accT, 2, false>(input, castArray<accT>(g2), bkind);
-    Array<T> retVal= arithOp<T, af_sub_t>(smth1, smth2, iDims);
+    Array<accT> smth1 = convolve<accT, accT, 2, false>(input, castArray<accT>(g1), bkind);
+    Array<accT> smth2 = convolve<accT, accT, 2, false>(input, castArray<accT>(g2), bkind);
+    Array<accT> retVal= arithOp<accT, af_sub_t>(smth1, smth2, iDims);
 
     AF_CHECK(af_release_array(g1));
     AF_CHECK(af_release_array(g2));
 
-    return getHandle<T>(retVal);
+    return getHandle<accT>(retVal);
 }
 
 af_err af_dog(af_array *out, const af_array in, const int radius1, const int radius2)
@@ -59,6 +59,8 @@ af_err af_dog(af_array *out, const af_array in, const int radius1, const int rad
             case b8 : output = dog<char  , float>(in, radius1, radius2); break;
             case s32: output = dog<int   , float>(in, radius1, radius2); break;
             case u32: output = dog<uint  , float>(in, radius1, radius2); break;
+            case s16: output = dog<short , float>(in, radius1, radius2); break;
+            case u16: output = dog<ushort, float>(in, radius1, radius2); break;
             case u8 : output = dog<uchar , float>(in, radius1, radius2); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp
index fdbe82f78d..371bbd95fa 100644
--- a/src/api/c/err_common.cpp
+++ b/src/api/c/err_common.cpp
@@ -16,7 +16,7 @@
 #include <cstdio>
 #include <algorithm>
 
-#if defined(WITH_GRAPHICS)
+#if defined(WITH_GRAPHICS) && !defined(AF_UNIFIED)
 #include <graphics_common.hpp>
 #endif
 
@@ -179,6 +179,8 @@ const char *af_err_to_string(const af_err err)
     case AF_ERR_NOT_CONFIGURED: return "Function not configured to build";
     case AF_ERR_TYPE:           return "Function does not support this data type";
     case AF_ERR_NO_DBL:         return "Double precision not supported for this device";
+    case AF_ERR_LOAD_LIB:       return "Failed to load dynamic library";
+    case AF_ERR_LOAD_SYM:       return "Failed to load symbol";
     case AF_ERR_UNKNOWN:
     default:
         return "Unknown error";
@@ -229,7 +231,7 @@ af_err processException()
 
         print_error(ss);
         err = ex.getError();
-#if defined(WITH_GRAPHICS)
+#if defined(WITH_GRAPHICS) && !defined(AF_UNIFIED)
     } catch (const fg::Error &ex) {
         ss << ex << "\n";
         print_error(ss);
diff --git a/src/api/c/fast.cpp b/src/api/c/fast.cpp
index e28f590768..9a403195f1 100644
--- a/src/api/c/fast.cpp
+++ b/src/api/c/fast.cpp
@@ -70,6 +70,8 @@ af_err af_fast(af_features *out, const af_array in, const float thr,
             case b8 : *out = fast<char  >(in, thr, arc_length, non_max, feature_ratio, edge); break;
             case s32: *out = fast<int   >(in, thr, arc_length, non_max, feature_ratio, edge); break;
             case u32: *out = fast<uint  >(in, thr, arc_length, non_max, feature_ratio, edge); break;
+            case s16: *out = fast<short >(in, thr, arc_length, non_max, feature_ratio, edge); break;
+            case u16: *out = fast<ushort>(in, thr, arc_length, non_max, feature_ratio, edge); break;
             case u8 : *out = fast<uchar >(in, thr, arc_length, non_max, feature_ratio, edge); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/fftconvolve.cpp b/src/api/c/fftconvolve.cpp
index fc3a91cc76..a7401058f0 100644
--- a/src/api/c/fftconvolve.cpp
+++ b/src/api/c/fftconvolve.cpp
@@ -143,6 +143,10 @@ af_err fft_convolve(af_array *out, const af_array signal, const af_array filter,
             case f32: output = fftconvolve<float , float,  cfloat,  false, false, baseDim>(signal, filter, expand, convBT); break;
             case u32: output = fftconvolve<uint  , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
             case s32: output = fftconvolve<int   , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
+            case u64: output = fftconvolve<uintl , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
+            case s64: output = fftconvolve<intl  , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
+            case u16: output = fftconvolve<ushort, float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
+            case s16: output = fftconvolve<short , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
             case u8:  output = fftconvolve<uchar , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
             case b8:  output = fftconvolve<char  , float,  cfloat,  false, true,  baseDim>(signal, filter, expand, convBT); break;
             case c32: output = fftconvolve_fallback<cfloat , cfloat , cfloat , baseDim>(signal, filter, expand); break;
diff --git a/src/api/c/filters.cpp b/src/api/c/filters.cpp
index 4658604937..5be7322d98 100644
--- a/src/api/c/filters.cpp
+++ b/src/api/c/filters.cpp
@@ -54,6 +54,8 @@ af_err af_medfilt(af_array *out, const af_array in, const dim_t wind_length, con
                 case b8 : output = medfilt<char  >(in, wind_length, wind_width, edge_pad); break;
                 case s32: output = medfilt<int   >(in, wind_length, wind_width, edge_pad); break;
                 case u32: output = medfilt<uint  >(in, wind_length, wind_width, edge_pad); break;
+                case s16: output = medfilt<short >(in, wind_length, wind_width, edge_pad); break;
+                case u16: output = medfilt<ushort>(in, wind_length, wind_width, edge_pad); break;
                 case u8 : output = medfilt<uchar >(in, wind_length, wind_width, edge_pad); break;
                 default : TYPE_ERROR(1, type);
             }
diff --git a/src/api/c/flip.cpp b/src/api/c/flip.cpp
index a88c217780..3d5bf53da8 100644
--- a/src/api/c/flip.cpp
+++ b/src/api/c/flip.cpp
@@ -69,6 +69,8 @@ af_err af_flip(af_array *result, const af_array in, const unsigned dim)
         case u32:    out = flipArray<unsigned>(in, dim);  break;
         case s64:    out = flipArray<intl>    (in, dim);  break;
         case u64:    out = flipArray<uintl>   (in, dim);  break;
+        case s16:    out = flipArray<short>   (in, dim);  break;
+        case u16:    out = flipArray<ushort>  (in, dim);  break;
         case u8:     out = flipArray<uchar>   (in, dim);  break;
         default:    TYPE_ERROR(1, in_type);
         }
diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp
index bec27feee4..4b50bc046e 100644
--- a/src/api/c/graphics_common.cpp
+++ b/src/api/c/graphics_common.cpp
@@ -20,13 +20,15 @@ template<typename T>
 GLenum getGLType() { return GL_FLOAT; }
 
 #define INSTANTIATE_GET_FG_TYPE(T, ForgeEnum)\
-    template<> fg::FGType getGLType<T>() { return ForgeEnum; }
+    template<> fg::dtype getGLType<T>() { return ForgeEnum; }
 
-INSTANTIATE_GET_FG_TYPE(float, fg::FG_FLOAT);
-INSTANTIATE_GET_FG_TYPE(int  , fg::FG_INT);
-INSTANTIATE_GET_FG_TYPE(unsigned, fg::FG_UNSIGNED_INT);
-INSTANTIATE_GET_FG_TYPE(char, fg::FG_BYTE);
-INSTANTIATE_GET_FG_TYPE(unsigned char, fg::FG_UNSIGNED_BYTE);
+INSTANTIATE_GET_FG_TYPE(float, fg::f32);
+INSTANTIATE_GET_FG_TYPE(int  , fg::s32);
+INSTANTIATE_GET_FG_TYPE(unsigned, fg::u32);
+INSTANTIATE_GET_FG_TYPE(char, fg::s8);
+INSTANTIATE_GET_FG_TYPE(unsigned char, fg::u8);
+INSTANTIATE_GET_FG_TYPE(unsigned short, fg::u16);
+INSTANTIATE_GET_FG_TYPE(short, fg::s16);
 
 GLenum glErrorSkip(const char *msg, const char* file, int line)
 {
@@ -78,6 +80,8 @@ size_t getTypeSize(GLenum type)
         case GL_FLOAT:          return sizeof(float);
         case GL_INT:            return sizeof(int  );
         case GL_UNSIGNED_INT:   return sizeof(unsigned);
+        case GL_SHORT:          return sizeof(short);
+        case GL_UNSIGNED_SHORT: return sizeof(unsigned short);
         case GL_BYTE:           return sizeof(char );
         case GL_UNSIGNED_BYTE:  return sizeof(unsigned char);
         default: return sizeof(float);
@@ -136,7 +140,7 @@ fg::Window* ForgeManager::getMainWindow(const bool dontCreate)
     return wnd;
 }
 
-fg::Image* ForgeManager::getImage(int w, int h, fg::ColorMode mode, fg::FGType type)
+fg::Image* ForgeManager::getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type)
 {
     /* w, h needs to fall in the range of [0, 2^16]
      * for the ForgeManager to correctly retrieve
@@ -157,7 +161,7 @@ fg::Image* ForgeManager::getImage(int w, int h, fg::ColorMode mode, fg::FGType t
     return mImgMap[key];
 }
 
-fg::Plot* ForgeManager::getPlot(int nPoints, fg::FGType type)
+fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type)
 {
     /* nPoints needs to fall in the range of [0, 2^48]
      * for the ForgeManager to correctly retrieve
@@ -176,7 +180,26 @@ fg::Plot* ForgeManager::getPlot(int nPoints, fg::FGType type)
     return mPltMap[key];
 }
 
-fg::Histogram* ForgeManager::getHistogram(int nBins, fg::FGType type)
+fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type)
+{
+    /* nPoints needs to fall in the range of [0, 2^48]
+     * for the ForgeManager to correctly retrieve
+     * the necessary Forge Plot object. So, this implementation
+     * is a limitation on how big of an plot graph can be rendered
+     * using arrayfire graphics funtionality */
+    assert(nPoints <= 2ll<<48);
+    long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT);
+
+    Plt3MapIter iter = mPlt3Map.find(key);
+    if (iter==mPlt3Map.end()) {
+        fg::Plot3* temp = new fg::Plot3(nPoints, type);
+        mPlt3Map[key] = temp;
+    }
+
+    return mPlt3Map[key];
+}
+
+fg::Histogram* ForgeManager::getHistogram(int nBins, fg::dtype type)
 {
     /* nBins needs to fall in the range of [0, 2^48]
      * for the ForgeManager to correctly retrieve
@@ -195,6 +218,25 @@ fg::Histogram* ForgeManager::getHistogram(int nBins, fg::FGType type)
     return mHstMap[key];
 }
 
+fg::Surface* ForgeManager::getSurface(int nX, int nY, fg::dtype type)
+{
+    /* nX * nY needs to fall in the range of [0, 2^48]
+     * for the ForgeManager to correctly retrieve
+     * the necessary Forge Plot object. So, this implementation
+     * is a limitation on how big of an plot graph can be rendered
+     * using arrayfire graphics funtionality */
+    assert(nX * nY <= 2ll<<48);
+    long long key = (((nX * nY) & _48BIT) << 48) | (type & _16BIT);
+
+    SfcMapIter iter = mSfcMap.find(key);
+    if (iter==mSfcMap.end()) {
+        fg::Surface* temp = new fg::Surface(nX, nY, type);
+        mSfcMap[key] = temp;
+    }
+
+    return mSfcMap[key];
+}
+
 void ForgeManager::destroyResources()
 {
     /* clear all OpenGL resource objects (images, plots, histograms etc) first
diff --git a/src/api/c/graphics_common.hpp b/src/api/c/graphics_common.hpp
index ac6f4c0bcd..39225e6a0c 100644
--- a/src/api/c/graphics_common.hpp
+++ b/src/api/c/graphics_common.hpp
@@ -18,7 +18,7 @@
 
 // default to f32(float) type
 template<typename T>
-fg::FGType getGLType();
+fg::dtype getGLType();
 
 // Print for OpenGL errors
 // Returns 1 if an OpenGL error occurred, 0 otherwise.
@@ -45,10 +45,14 @@ static const long long _48BIT = 0x0000FFFFFFFFFFFF;
 typedef std::map<long long, fg::Image*> ImageMap_t;
 typedef std::map<long long, fg::Plot*> PlotMap_t;
 typedef std::map<long long, fg::Histogram*> HistogramMap_t;
+typedef std::map<long long, fg::Plot3*> Plot3Map_t;
+typedef std::map<long long, fg::Surface*> SurfaceMap_t;
 
 typedef ImageMap_t::iterator ImgMapIter;
 typedef PlotMap_t::iterator PltMapIter;
+typedef Plot3Map_t::iterator Plt3MapIter;
 typedef HistogramMap_t::iterator HstMapIter;
+typedef SurfaceMap_t::iterator SfcMapIter;
 
 /**
  * ForgeManager class follows a single pattern. Any user of this class, has
@@ -58,14 +62,18 @@ typedef HistogramMap_t::iterator HstMapIter;
  * Renderables:
  *             fg::Image
  *             fg::Plot
+ *             fg::Plot3
  *             fg::Histogram
+ *             fg::Surface
  * */
 class ForgeManager
 {
     private:
         ImageMap_t      mImgMap;
         PlotMap_t       mPltMap;
+        Plot3Map_t      mPlt3Map;
         HistogramMap_t  mHstMap;
+        SurfaceMap_t    mSfcMap;
 
     public:
         static ForgeManager& getInstance();
@@ -73,9 +81,11 @@ class ForgeManager
 
         fg::Font* getFont(const bool dontCreate=false);
         fg::Window* getMainWindow(const bool dontCreate=false);
-        fg::Image* getImage(int w, int h, fg::ColorMode mode, fg::FGType type);
-        fg::Plot* getPlot(int nPoints, fg::FGType type);
-        fg::Histogram* getHistogram(int nBins, fg::FGType type);
+        fg::Image* getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type);
+        fg::Plot* getPlot(int nPoints, fg::dtype type);
+        fg::Plot3* getPlot3(int nPoints, fg::dtype type);
+        fg::Histogram* getHistogram(int nBins, fg::dtype type);
+        fg::Surface* getSurface(int nX, int nY, fg::dtype type);
 
     protected:
         ForgeManager() {}
diff --git a/src/api/c/handle.hpp b/src/api/c/handle.hpp
index beb8393907..70f17eb18e 100644
--- a/src/api/c/handle.hpp
+++ b/src/api/c/handle.hpp
@@ -31,6 +31,7 @@ detail::Array<To> castArray(const af_array &in)
     using detail::cdouble;
     using detail::uint;
     using detail::uchar;
+    using detail::ushort;
 
     const ArrayInfo info = getInfo(in);
     switch (info.getType()) {
@@ -44,6 +45,8 @@ detail::Array<To> castArray(const af_array &in)
     case b8 : return detail::cast<To, char   >(getArray<char   >(in));
     case s64: return detail::cast<To, intl   >(getArray<intl   >(in));
     case u64: return detail::cast<To, uintl  >(getArray<uintl  >(in));
+    case s16: return detail::cast<To, short  >(getArray<short  >(in));
+    case u16: return detail::cast<To, ushort >(getArray<ushort >(in));
     default: TYPE_ERROR(1, info.getType());
     }
 }
diff --git a/src/api/c/hist.cpp b/src/api/c/hist.cpp
index 8cf1ac16a4..4ddf43bbb4 100644
--- a/src/api/c/hist.cpp
+++ b/src/api/c/hist.cpp
@@ -39,8 +39,7 @@ fg::Histogram* setup_histogram(const af_array in, const double minval, const dou
     /* set x axis limits to maximum and minimum values of data
      * and y axis limits to range [0, nBins]*/
     hist->setAxesLimits(maxval, minval, double(freqMax), 0.0f);
-    hist->setXAxisTitle("Bins");
-    hist->setYAxisTitle("Frequency");
+    hist->setAxesTitles("Bins", "Frequency");
 
     copy_histogram<T>(histogramInput, hist);
 
@@ -71,6 +70,8 @@ af_err af_draw_hist(const af_window wind, const af_array X, const double minval,
             case f32: hist = setup_histogram<float  >(X, minval, maxval); break;
             case s32: hist = setup_histogram<int    >(X, minval, maxval); break;
             case u32: hist = setup_histogram<uint   >(X, minval, maxval); break;
+            case s16: hist = setup_histogram<short  >(X, minval, maxval); break;
+            case u16: hist = setup_histogram<ushort >(X, minval, maxval); break;
             case u8 : hist = setup_histogram<uchar  >(X, minval, maxval); break;
             default:  TYPE_ERROR(1, Xtype);
         }
diff --git a/src/api/c/histeq.cpp b/src/api/c/histeq.cpp
index 1b14ae54b2..78c3f16e4a 100644
--- a/src/api/c/histeq.cpp
+++ b/src/api/c/histeq.cpp
@@ -77,6 +77,10 @@ af_err af_hist_equal(af_array *out, const af_array in, const af_array hist)
             case f32: output = hist_equal<float , uint>(in, hist); break;
             case s32: output = hist_equal<int   , uint>(in, hist); break;
             case u32: output = hist_equal<uint  , uint>(in, hist); break;
+            case s16: output = hist_equal<short , uint>(in, hist); break;
+            case u16: output = hist_equal<ushort, uint>(in, hist); break;
+            case s64: output = hist_equal<intl  , uint>(in, hist); break;
+            case u64: output = hist_equal<uintl , uint>(in, hist); break;
             case u8 : output = hist_equal<uchar , uint>(in, hist); break;
             default : TYPE_ERROR(1, dataType);
         }
diff --git a/src/api/c/histogram.cpp b/src/api/c/histogram.cpp
index a7c4be7247..cd6dee8e30 100644
--- a/src/api/c/histogram.cpp
+++ b/src/api/c/histogram.cpp
@@ -42,6 +42,10 @@ af_err af_histogram(af_array *out, const af_array in,
             case b8 : output = histogram<char  , uint>(in, nbins, minval, maxval, info.isLinear()); break;
             case s32: output = histogram<int   , uint>(in, nbins, minval, maxval, info.isLinear()); break;
             case u32: output = histogram<uint  , uint>(in, nbins, minval, maxval, info.isLinear()); break;
+            case s16: output = histogram<short , uint>(in, nbins, minval, maxval, info.isLinear()); break;
+            case u16: output = histogram<ushort, uint>(in, nbins, minval, maxval, info.isLinear()); break;
+            case s64: output = histogram<intl  , uint>(in, nbins, minval, maxval, info.isLinear()); break;
+            case u64: output = histogram<uintl , uint>(in, nbins, minval, maxval, info.isLinear()); break;
             case u8 : output = histogram<uchar , uint>(in, nbins, minval, maxval, info.isLinear()); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/homography.cpp b/src/api/c/homography.cpp
new file mode 100644
index 0000000000..c8fc9bd0ec
--- /dev/null
+++ b/src/api/c/homography.cpp
@@ -0,0 +1,88 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/defines.h>
+#include <af/vision.h>
+#include <err_common.hpp>
+#include <handle.hpp>
+#include <backend.hpp>
+#include <ArrayInfo.hpp>
+#include <homography.hpp>
+
+using af::dim4;
+using namespace detail;
+
+template<typename T>
+static inline void homography(af_array &H, int &inliers,
+                              const af_array x_src, const af_array y_src,
+                              const af_array x_dst, const af_array y_dst,
+                              const af_homography_type htype, const float inlier_thr,
+                              const unsigned iterations)
+{
+    Array<T> bestH = createEmptyArray<T>(af::dim4(3, 3));
+
+    inliers = homography<T>(bestH,
+                            getArray<float>(x_src), getArray<float>(y_src),
+                            getArray<float>(x_dst), getArray<float>(y_dst),
+                            htype, inlier_thr, iterations);
+
+    H = getHandle<T>(bestH);
+}
+
+af_err af_homography(af_array *H, int *inliers,
+                     const af_array x_src, const af_array y_src,
+                     const af_array x_dst, const af_array y_dst,
+                     const af_homography_type htype, const float inlier_thr,
+                     const unsigned iterations, const af_dtype otype)
+{
+    try {
+        ArrayInfo xsinfo = getInfo(x_src);
+        ArrayInfo ysinfo = getInfo(y_src);
+        ArrayInfo xdinfo = getInfo(x_dst);
+        ArrayInfo ydinfo = getInfo(y_dst);
+
+        af::dim4 xsdims  = xsinfo.dims();
+        af::dim4 ysdims  = ysinfo.dims();
+        af::dim4 xddims  = xdinfo.dims();
+        af::dim4 yddims  = ydinfo.dims();
+
+        af_dtype xstype = xsinfo.getType();
+        af_dtype ystype = ysinfo.getType();
+        af_dtype xdtype = xdinfo.getType();
+        af_dtype ydtype = ydinfo.getType();
+
+        if (xstype != f32) { TYPE_ERROR(1, xstype); }
+        if (ystype != f32) { TYPE_ERROR(2, ystype); }
+        if (xdtype != f32) { TYPE_ERROR(3, xdtype); }
+        if (ydtype != f32) { TYPE_ERROR(4, ydtype); }
+
+        ARG_ASSERT(1, (xsdims[0] > 0));
+        ARG_ASSERT(2, (ysdims[0] == xsdims[0]));
+        ARG_ASSERT(3, (xddims[0] > 0));
+        ARG_ASSERT(4, (yddims[0] == yddims[0]));
+
+        ARG_ASSERT(5, (inlier_thr >= 0.1f));
+        ARG_ASSERT(6, (iterations > 0));
+
+        af_array outH;
+        int outInl;
+
+        switch(otype) {
+            case f32: homography<float >(outH, outInl, x_src, y_src, x_dst, y_dst, htype, inlier_thr, iterations);  break;
+            case f64: homography<double>(outH, outInl, x_src, y_src, x_dst, y_dst, htype, inlier_thr, iterations);  break;
+            default:  TYPE_ERROR(1, otype);
+        }
+        std::swap(*H, outH);
+        std::swap(*inliers, outInl);
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/image.cpp b/src/api/c/image.cpp
index c59b31b23c..ee2520cfc1 100644
--- a/src/api/c/image.cpp
+++ b/src/api/c/image.cpp
@@ -40,8 +40,7 @@ Array<T> normalizePerType(const Array<T>& in)
 {
     Array<float> inFloat = cast<float, T>(in);
 
-    Array<float> cnst = createValueArray<float>(in.dims(),
-                             std::numeric_limits<T>::max()/(255.0f+1.0e-6f));
+    Array<float> cnst = createValueArray<float>(in.dims(), 1.0 - 1.0e-6f);
 
     Array<float> scaled = arithOp<float, af_mul_t>(inFloat, cnst, in.dims());
 
@@ -66,7 +65,9 @@ static fg::Image* convert_and_copy_image(const af_array in)
 
     ForgeManager& fgMngr = ForgeManager::getInstance();
 
-    fg::Image* ret_val = fgMngr.getImage(inDims[1], inDims[0], (fg::ColorMode)inDims[2], getGLType<T>());
+    // The inDims[2] * 100 is a hack to convert to fg::ChannelFormat
+    // TODO Write a proper conversion function
+    fg::Image* ret_val = fgMngr.getImage(inDims[1], inDims[0], (fg::ChannelFormat)(inDims[2] * 100), getGLType<T>());
 
     copy_image<T>(normalizePerType<T>(imgData), ret_val);
 
@@ -95,11 +96,13 @@ af_err af_draw_image(const af_window wind, const af_array in, const af_cell* con
         fg::Image* image = NULL;
 
         switch(type) {
-            case f32: image = convert_and_copy_image<float>(in); break;
-            case b8 : image = convert_and_copy_image<char >(in); break;
-            case s32: image = convert_and_copy_image<int  >(in); break;
-            case u32: image = convert_and_copy_image<uint >(in); break;
-            case u8 : image = convert_and_copy_image<uchar>(in); break;
+            case f32: image = convert_and_copy_image<float >(in); break;
+            case b8 : image = convert_and_copy_image<char  >(in); break;
+            case s32: image = convert_and_copy_image<int   >(in); break;
+            case u32: image = convert_and_copy_image<uint  >(in); break;
+            case s16: image = convert_and_copy_image<short >(in); break;
+            case u16: image = convert_and_copy_image<ushort>(in); break;
+            case u8 : image = convert_and_copy_image<uchar >(in); break;
             default:  TYPE_ERROR(1, type);
         }
 
@@ -233,7 +236,7 @@ af_err af_show(const af_window wind)
 
     try {
         fg::Window* wnd = reinterpret_cast<fg::Window*>(wind);
-        wnd->draw();
+        wnd->swapBuffers();
     }
     CATCHALL;
     return AF_SUCCESS;
diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp
index 813a2d3294..3442a2adff 100644
--- a/src/api/c/imageio.cpp
+++ b/src/api/c/imageio.cpp
@@ -9,20 +9,21 @@
 
 #if defined(WITH_FREEIMAGE)
 
+#include "imageio_helper.h"
+
 #include <af/array.h>
+#include <af/index.h>
+#include <af/dim4.hpp>
 #include <af/arith.h>
 #include <af/algorithm.h>
 #include <af/blas.h>
 #include <af/data.h>
 #include <af/image.h>
-#include <af/index.h>
-#include <err_common.hpp>
 #include <backend.hpp>
 #include <ArrayInfo.hpp>
 #include <traits.hpp>
 #include <memory.hpp>
 
-#include <FreeImage.h>
 #include <string>
 #include <cstring>
 #include <cstdio>
@@ -31,74 +32,9 @@
 using af::dim4;
 using namespace detail;
 
-class FI_Manager
-{
-    public:
-    static bool initialized;
-    FI_Manager()
-    {
-#ifdef FREEIMAGE_LIB
-        FreeImage_Initialise();
-#endif
-        initialized = true;
-    }
-
-    ~FI_Manager()
-    {
-#ifdef FREEIMAGE_LIB
-        FreeImage_DeInitialise();
-#endif
-    }
-};
-
 bool FI_Manager::initialized = false;
 
-static void FI_Init()
-{
-    static FI_Manager manager = FI_Manager();
-}
-
-// Helpers
-void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char* zMessage);
-
-typedef unsigned short ushort;
-
-// Error handler for FreeImage library.
-// In case this handler is invoked, it throws an af exception.
-void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char* zMessage)
-{
-    printf("FreeImage Error Handler: %s\n", zMessage);
-}
-
-//  Split a MxNx3 image into 3 separate channel matrices.
-//  Produce 3 channels if needed
-static af_err channel_split(const af_array rgb, const af::dim4 &dims,
-                            af_array *outr, af_array *outg, af_array *outb, af_array *outa)
-{
-    try {
-        af_seq idx[4][3] = {{af_span, af_span, {0, 0, 1}},
-                            {af_span, af_span, {1, 1, 1}},
-                            {af_span, af_span, {2, 2, 1}},
-                            {af_span, af_span, {3, 3, 1}}
-                           };
-
-        if (dims[2] == 4) {
-            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
-            AF_CHECK(af_index(outg, rgb, dims.ndims(), idx[1]));
-            AF_CHECK(af_index(outb, rgb, dims.ndims(), idx[2]));
-            AF_CHECK(af_index(outa, rgb, dims.ndims(), idx[3]));
-        } else if (dims[2] == 3) {
-            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
-            AF_CHECK(af_index(outg, rgb, dims.ndims(), idx[1]));
-            AF_CHECK(af_index(outb, rgb, dims.ndims(), idx[2]));
-        } else {
-            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
-        }
-    } CATCHALL;
-    return AF_SUCCESS;
-}
-
-template<typename T, int fi_color, int fo_color>
+template<typename T, FI_CHANNELS fi_color, FI_CHANNELS fo_color>
 static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcPitch,
                         const uint fi_w, const uint fi_h)
 {
@@ -110,20 +46,28 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP
     float* pDst2 = pDst + (fi_w * fi_h * 2);
     float* pDst3 = pDst + (fi_w * fi_h * 3);
 
-    int offR = 2; int offG = 1; int offB = 0; int offA = 3;
-    if (fo_color == 3 && fi_color == 1) {       //Convert gray to color
-        offG = 0; offR = 0;
-    }
     uint indx = 0;
     uint step = fi_color;
 
     for (uint x = 0; x < fi_w; ++x) {
         for (uint y = 0; y < fi_h; ++y) {
             const T *src = (T*)(pSrcLine - y * nSrcPitch);
-                               pDst2[indx] = (float) *(src + (x * step + offB));
-            if (fo_color >= 3) pDst1[indx] = (float) *(src + (x * step + offG));
-            if (fo_color >= 3) pDst0[indx] = (float) *(src + (x * step + offR));
-            if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + offA));
+            if(fo_color == 1) {
+                pDst0[indx] = (T) *(src + (x * step));
+            } else if(fo_color >= 3) {
+                if((af_dtype) af::dtype_traits<T>::af_type == u8) {
+                    pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED));
+                    pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN));
+                    pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE));
+                } else {
+                    // Non 8-bit types do not use ordering
+                    // See Pixel Access Functions Chapter in FreeImage Doc
+                    pDst0[indx] = (float) *(src + (x * step + 0));
+                    pDst1[indx] = (float) *(src + (x * step + 1));
+                    pDst2[indx] = (float) *(src + (x * step + 2));
+                }
+                if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA));
+            }
             indx++;
         }
     }
@@ -135,7 +79,7 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP
     return err;
 }
 
-template<typename T, int fo_color>
+template<typename T, FI_CHANNELS fo_color>
 static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcPitch,
                         const uint fi_w, const uint fi_h)
 {
@@ -149,12 +93,20 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP
     for (uint x = 0; x < fi_w; ++x) {
         for (uint y = 0; y < fi_h; ++y) {
             const T *src = (T*)(pSrcLine - y * nSrcPitch);
-            if (fo_color == 1) {
-                pDst[indx] = (float) *(src + (x * step));
-            } else if (fo_color >=3) {
-                r = (float) *(src + (x * step + 2));
-                g = (float) *(src + (x * step + 1));
-                b = (float) *(src + (x * step + 0));
+            if(fo_color == 1) {
+                pDst[indx] = (T) *(src + (x * step));
+            } else if(fo_color >= 3) {
+                if((af_dtype) af::dtype_traits<T>::af_type == u8) {
+                    r = (T) *(src + (x * step + FI_RGBA_RED));
+                    g = (T) *(src + (x * step + FI_RGBA_GREEN));
+                    b = (T) *(src + (x * step + FI_RGBA_BLUE));
+                } else {
+                    // Non 8-bit types do not use ordering
+                    // See Pixel Access Functions Chapter in FreeImage Doc
+                    r = (T) *(src + (x * step + 0));
+                    g = (T) *(src + (x * step + 1));
+                    b = (T) *(src + (x * step + 2));
+                }
                 pDst[indx] = r * 0.2989f + g * 0.5870f + b * 0.1140f;
             }
             indx++;
@@ -192,16 +144,23 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor)
             AF_ERROR("FreeImage Error: Unknown File or Filetype", AF_ERR_NOT_SUPPORTED);
         }
 
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_ACCURATE;
+        if(fif == FIF_JPEG && !isColor) flags = flags | JPEG_GREYSCALE;
+
         // check that the plugin has reading capabilities ...
         FIBITMAP* pBitmap = NULL;
         if (FreeImage_FIFSupportsReading(fif)) {
-            pBitmap = FreeImage_Load(fif, filename);
+            pBitmap = FreeImage_Load(fif, filename, flags);
         }
 
         if(pBitmap == NULL) {
             AF_ERROR("FreeImage Error: Error reading image or file does not exist", AF_ERR_RUNTIME);
         }
 
+        // make sure pBitmap is unleaded automatically, no matter how we exit this function
+        FI_BitmapResource bitmapUnloader(pBitmap);
+
         // check image color type
         uint color_type = FreeImage_GetColorType(pBitmap);
         const uint fi_bpp = FreeImage_GetBPP(pBitmap);
@@ -239,45 +198,44 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor)
         if (isColor) {
             if(fi_color == 4) {     //4 channel image
                 if(fi_bpc == 8)
-                    AF_CHECK((readImage<uchar, 4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<uchar,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
-                    AF_CHECK((readImage<ushort, 4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<ushort, AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float, 4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<float,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             } else if (fi_color == 1) {
                 if(fi_bpc == 8)
-                    AF_CHECK((readImage<uchar, 1, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<uchar,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
-                    AF_CHECK((readImage<ushort, 1, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<ushort, AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float, 1, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<float,  AFFI_GRAY, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             } else {             //3 channel image
                 if(fi_bpc == 8)
-                    AF_CHECK((readImage<uchar, 3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<uchar,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
-                    AF_CHECK((readImage<ushort, 3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<ushort, AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float, 3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<float,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             }
         } else {                    //output gray irrespective
             if(fi_color == 1) {     //4 channel image
                 if(fi_bpc == 8)
-                    AF_CHECK((readImage<uchar, 1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<uchar,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
-                    AF_CHECK((readImage<ushort, 1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<ushort, AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float, 1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             } else if (fi_color == 3 || fi_color == 4) {
                 if(fi_bpc == 8)
-                    AF_CHECK((readImage<uchar, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<uchar,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 16)
-                    AF_CHECK((readImage<ushort, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<ushort, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
                 else if(fi_bpc == 32)
-                    AF_CHECK((readImage<float, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                    AF_CHECK((readImage<float,  AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             }
         }
 
-        FreeImage_Unload(pBitmap);
         std::swap(*out,rImage);
     } CATCHALL;
 
@@ -324,6 +282,9 @@ af_err af_save_image(const char* filename, const af_array in_)
             AF_ERROR("FreeImage Error: Error creating image or file", AF_ERR_RUNTIME);
         }
 
+        // make sure pResultBitmap is unleaded automatically, no matter how we exit this function
+        FI_BitmapResource resultBitmapUnloader(pResultBitmap);
+
         // FI assumes [0-255]
         // If array is in 0-1 range, multiply by 255
         af_array in;
@@ -331,7 +292,7 @@ af_err af_save_image(const char* filename, const af_array in_)
         bool free_in = false;
         AF_CHECK(af_max_all(&max_real, &max_imag, in_));
         if (max_real <= 1) {
-            af_array c255;
+            af_array c255 = 0;
             AF_CHECK(af_constant(&c255, 255.0, info.ndims(), info.dims().get(), f32));
             AF_CHECK(af_mul(&in, in_, c255, false));
             AF_CHECK(af_release_array(c255));
@@ -371,9 +332,9 @@ af_err af_save_image(const char* filename, const af_array in_)
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
                     *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
+                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
                     *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a
                     ++indx;
                 }
@@ -400,9 +361,9 @@ af_err af_save_image(const char* filename, const af_array in_)
             // Copy the array into FreeImage buffer
             for (uint y = 0; y < fi_h; ++y) {
                 for (uint x = 0; x < fi_w; ++x) {
-                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
-                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
                     *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r
+                    *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g
+                    *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b
                     ++indx;
                 }
                 pDstLine -= nDstPitch;
@@ -426,13 +387,14 @@ af_err af_save_image(const char* filename, const af_array in_)
             pinnedFree(pSrc0);
         }
 
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_QUALITYSUPERB;
+
         // now save the result image
-        if (!(FreeImage_Save(fif, pResultBitmap, filename, 0) == TRUE)) {
+        if (!(FreeImage_Save(fif, pResultBitmap, filename, flags) == TRUE)) {
             AF_ERROR("FreeImage Error: Failed to save image", AF_ERR_RUNTIME);
         }
 
-        FreeImage_Unload(pResultBitmap);
-
         if(free_in) AF_CHECK(af_release_array(in ));
         if(rr != 0) AF_CHECK(af_release_array(rr ));
         if(gg != 0) AF_CHECK(af_release_array(gg ));
@@ -476,16 +438,22 @@ af_err af_load_image_memory(af_array *out, const void* ptr)
             AF_ERROR("FreeImage Error: Unknown File or Filetype", AF_ERR_NOT_SUPPORTED);
         }
 
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_ACCURATE;
+
         // check that the plugin has reading capabilities ...
         FIBITMAP* pBitmap = NULL;
         if (FreeImage_FIFSupportsReading(fif)) {
-            pBitmap = FreeImage_LoadFromMemory(fif, stream, 0);
+            pBitmap = FreeImage_LoadFromMemory(fif, stream, flags);
         }
 
         if(pBitmap == NULL) {
             AF_ERROR("FreeImage Error: Error reading image or file does not exist", AF_ERR_RUNTIME);
         }
 
+        // make sure pBitmap is unleaded automatically, no matter how we exit this function
+        FI_BitmapResource bitmapUnloader(pBitmap);
+
         // check image color type
         uint color_type = FreeImage_GetColorType(pBitmap);
         const uint fi_bpp = FreeImage_GetBPP(pBitmap);
@@ -521,28 +489,27 @@ af_err af_load_image_memory(af_array *out, const void* ptr)
         af_array rImage;
         if(fi_color == 4) {     //4 channel image
             if(fi_bpc == 8)
-                AF_CHECK((readImage<uchar, 4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<uchar,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 16)
-                AF_CHECK((readImage<ushort,4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<ushort, AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage<float, 4, 4>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<float,  AFFI_RGBA, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
         } else if (fi_color == 1) { // 1 channel image
             if(fi_bpc == 8)
-                AF_CHECK((readImage<uchar, 1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<uchar,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 16)
-                AF_CHECK((readImage<ushort,1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<ushort, AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage<float, 1>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
         } else {             //3 channel image
             if(fi_bpc == 8)
-                AF_CHECK((readImage<uchar, 3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<uchar,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 16)
-                AF_CHECK((readImage<ushort,3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<ushort, AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
             else if(fi_bpc == 32)
-                AF_CHECK((readImage<float, 3, 3>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+                AF_CHECK((readImage<float,  AFFI_RGB, AFFI_RGB>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
         }
 
-        FreeImage_Unload(pBitmap);
         std::swap(*out,rImage);
     } CATCHALL;
 
@@ -584,6 +551,9 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma
             AF_ERROR("FreeImage Error: Error creating image or file", AF_ERR_RUNTIME);
         }
 
+        // make sure pResultBitmap is unleaded automatically, no matter how we exit this function
+        FI_BitmapResource resultBitmapUnloader(pResultBitmap);
+
         // FI assumes [0-255]
         // If array is in 0-1 range, multiply by 255
         af_array in;
@@ -688,15 +658,16 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma
 
         FIMEMORY *stream = FreeImage_OpenMemory();
 
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_QUALITYSUPERB;
+
         // now save the result image
-        if (!(FreeImage_SaveToMemory(fif, pResultBitmap, stream, 0) == TRUE)) {
+        if (!(FreeImage_SaveToMemory(fif, pResultBitmap, stream, flags) == TRUE)) {
             AF_ERROR("FreeImage Error: Failed to save image", AF_ERR_RUNTIME);
         }
 
         *ptr = stream;
 
-        FreeImage_Unload(pResultBitmap);
-
         if(free_in) AF_CHECK(af_release_array(in ));
         if(rr != 0) AF_CHECK(af_release_array(rr ));
         if(gg != 0) AF_CHECK(af_release_array(gg ));
diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp
new file mode 100644
index 0000000000..de12fc7d8a
--- /dev/null
+++ b/src/api/c/imageio2.cpp
@@ -0,0 +1,389 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_FREEIMAGE)
+
+#include "imageio_helper.h"
+
+#include <af/array.h>
+#include <af/index.h>
+#include <af/dim4.hpp>
+#include <af/arith.h>
+#include <af/algorithm.h>
+#include <af/blas.h>
+#include <af/data.h>
+#include <af/image.h>
+#include <backend.hpp>
+#include <ArrayInfo.hpp>
+#include <traits.hpp>
+#include <memory.hpp>
+
+#include <string>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+
+using af::dim4;
+using namespace detail;
+
+template<typename T, FI_CHANNELS fi_color>
+static af_err readImage_t(af_array *rImage, const uchar* pSrcLine, const int nSrcPitch,
+                            const uint fi_w, const uint fi_h)
+{
+    // create an array to receive the loaded image data.
+    AF_CHECK(af_init());
+    T *pDst = pinnedAlloc<T>(fi_w * fi_h * 4); // 4 channels is max
+    T* pDst0 = pDst;
+    T* pDst1 = pDst + (fi_w * fi_h * 1);
+    T* pDst2 = pDst + (fi_w * fi_h * 2);
+    T* pDst3 = pDst + (fi_w * fi_h * 3);
+
+    uint indx = 0;
+    uint step = fi_color;
+
+    for (uint x = 0; x < fi_w; ++x) {
+        for (uint y = 0; y < fi_h; ++y) {
+            const T *src = (T*)((uchar*)pSrcLine - y * nSrcPitch);
+            if(fi_color == 1) {
+                pDst0[indx] = (T) *(src + (x * step));
+            } else if(fi_color >= 3) {
+                if((af_dtype) af::dtype_traits<T>::af_type == u8) {
+                    pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED));
+                    pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN));
+                    pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE));
+                } else {
+                    // Non 8-bit types do not use ordering
+                    // See Pixel Access Functions Chapter in FreeImage Doc
+                    pDst0[indx] = (T) *(src + (x * step + 0));
+                    pDst1[indx] = (T) *(src + (x * step + 1));
+                    pDst2[indx] = (T) *(src + (x * step + 2));
+                }
+                if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA));
+            }
+            indx++;
+        }
+    }
+
+    // TODO
+    af::dim4 dims(fi_h, fi_w, fi_color, 1);
+    af_err err = af_create_array(rImage, pDst, dims.ndims(), dims.get(),
+                                 (af_dtype) af::dtype_traits<T>::af_type);
+    pinnedFree(pDst);
+    return err;
+}
+
+FREE_IMAGE_TYPE getFIT(FI_CHANNELS channels, af_dtype type)
+{
+    if(channels == AFFI_GRAY) {
+             if(type == u8 ) return FIT_BITMAP;
+        else if(type == u16) return FIT_UINT16;
+        else if(type == f32) return FIT_FLOAT;
+    } else if(channels == AFFI_RGB) {
+             if(type == u8 ) return FIT_BITMAP;
+        else if(type == u16) return FIT_RGB16;
+        else if(type == f32) return FIT_RGBF;
+    } else if(channels == AFFI_RGBA) {
+             if(type == u8 ) return FIT_BITMAP;
+        else if(type == u16) return FIT_RGBA16;
+        else if(type == f32) return FIT_RGBAF;
+    }
+    return FIT_BITMAP;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// File IO
+////////////////////////////////////////////////////////////////////////////////
+// Load image from disk.
+af_err af_load_image_native(af_array *out, const char* filename)
+{
+    try {
+        ARG_ASSERT(1, filename != NULL);
+
+        // for statically linked FI
+        FI_Init();
+
+        // set your own FreeImage error handler
+        FreeImage_SetOutputMessage(FreeImageErrorHandler);
+
+        // try to guess the file format from the file extension
+        FREE_IMAGE_FORMAT fif = FreeImage_GetFileType(filename);
+        if (fif == FIF_UNKNOWN) {
+            fif = FreeImage_GetFIFFromFilename(filename);
+        }
+
+        if(fif == FIF_UNKNOWN) {
+            AF_ERROR("FreeImage Error: Unknown File or Filetype", AF_ERR_NOT_SUPPORTED);
+        }
+
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_ACCURATE;
+
+        // check that the plugin has reading capabilities ...
+        FIBITMAP* pBitmap = NULL;
+        if (FreeImage_FIFSupportsReading(fif)) {
+            pBitmap = FreeImage_Load(fif, filename, flags);
+        }
+
+        if(pBitmap == NULL) {
+            AF_ERROR("FreeImage Error: Error reading image or file does not exist", AF_ERR_RUNTIME);
+        }
+
+        // make sure pBitmap is unleaded automatically, no matter how we exit this function
+        FI_BitmapResource bitmapUnloader(pBitmap);
+
+        // check image color type
+        uint color_type = FreeImage_GetColorType(pBitmap);
+        const uint fi_bpp = FreeImage_GetBPP(pBitmap);
+        //int fi_color = (int)((fi_bpp / 8.0) + 0.5);        //ceil
+        int fi_color;
+        switch(color_type) {
+            case 0:                     // FIC_MINISBLACK
+            case 1:                     // FIC_MINISWHITE
+                fi_color = 1; break;
+            case 2:                     // FIC_PALETTE
+            case 3:                     // FIC_RGB
+                fi_color = 3; break;
+            case 4:                     // FIC_RGBALPHA
+            case 5:                     // FIC_CMYK
+                fi_color = 4; break;
+            default:                    // Should not come here
+                fi_color = 3; break;
+        }
+
+        const int fi_bpc = fi_bpp / fi_color;
+        if(fi_bpc != 8 && fi_bpc != 16 && fi_bpc != 32) {
+            AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED);
+        }
+
+        // sizes
+        uint fi_w = FreeImage_GetWidth(pBitmap);
+        uint fi_h = FreeImage_GetHeight(pBitmap);
+
+        // FI = row major | AF = column major
+        uint nSrcPitch = FreeImage_GetPitch(pBitmap);
+        const uchar* pSrcLine = FreeImage_GetBits(pBitmap) + nSrcPitch * (fi_h - 1);
+
+        // result image
+        af_array rImage;
+        if(fi_color == 4) {     //4 channel image
+            if(fi_bpc == 8)
+                AF_CHECK((readImage_t<uchar,  AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 16)
+                AF_CHECK((readImage_t<ushort, AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 32)
+                AF_CHECK((readImage_t<float,  AFFI_RGBA>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+        } else if (fi_color == 1) {
+            if(fi_bpc == 8)
+                AF_CHECK((readImage_t<uchar,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 16)
+                AF_CHECK((readImage_t<ushort, AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 32)
+                AF_CHECK((readImage_t<float,  AFFI_GRAY>)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+        } else {             //3 channel imag
+            if(fi_bpc == 8)
+                AF_CHECK((readImage_t<uchar,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 16)
+                AF_CHECK((readImage_t<ushort, AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+            else if(fi_bpc == 32)
+                AF_CHECK((readImage_t<float,  AFFI_RGB >)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h));
+        }
+
+        std::swap(*out,rImage);
+    } CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+template<typename T, FI_CHANNELS channels>
+static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPitch)
+{
+    af_array rr = 0, gg = 0, bb = 0, aa = 0;
+    AF_CHECK(channel_split(in, dims, &rr, &gg, &bb, &aa)); // convert array to 3 channels if needed
+
+    af_array rrT = 0, ggT = 0, bbT = 0, aaT = 0;
+    T *pSrc0 = 0, *pSrc1 = 0, *pSrc2 = 0, *pSrc3 = 0;
+
+    uint step = channels; // force 3 channels saving
+    uint indx = 0;
+
+                      AF_CHECK(af_transpose(&rrT, rr, false));
+    if(channels >= 3) AF_CHECK(af_transpose(&ggT, gg, false));
+    if(channels >= 3) AF_CHECK(af_transpose(&bbT, bb, false));
+    if(channels >= 4) AF_CHECK(af_transpose(&aaT, aa, false));
+
+    ArrayInfo cinfo = getInfo(rrT);
+                      pSrc0 = pinnedAlloc<T>(cinfo.elements());
+    if(channels >= 3) pSrc1 = pinnedAlloc<T>(cinfo.elements());
+    if(channels >= 3) pSrc2 = pinnedAlloc<T>(cinfo.elements());
+    if(channels >= 4) pSrc3 = pinnedAlloc<T>(cinfo.elements());
+
+                      AF_CHECK(af_get_data_ptr((void*)pSrc0, rrT));
+    if(channels >= 3) AF_CHECK(af_get_data_ptr((void*)pSrc1, ggT));
+    if(channels >= 3) AF_CHECK(af_get_data_ptr((void*)pSrc2, bbT));
+    if(channels >= 4) AF_CHECK(af_get_data_ptr((void*)pSrc3, aaT));
+
+    const uint fi_w = dims[1];
+    const uint fi_h = dims[0];
+
+    // Copy the array into FreeImage buffer
+    for (uint y = 0; y < fi_h; ++y) {
+        for (uint x = 0; x < fi_w; ++x) {
+            if(channels == 1) {
+                *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 0
+            } else if(channels >=3) {
+                if((af_dtype) af::dtype_traits<T>::af_type == u8) {
+                    *(pDstLine + x * step + FI_RGBA_BLUE)  = (T) pSrc2[indx]; // b -> 0
+                    *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1
+                    *(pDstLine + x * step + FI_RGBA_RED)   = (T) pSrc0[indx]; // r -> 2
+                } else {
+                    // Non 8-bit types do not use ordering
+                    // See Pixel Access Functions Chapter in FreeImage Doc
+                    *(pDstLine + x * step + 0) = (T) pSrc0[indx]; // r -> 0
+                    *(pDstLine + x * step + 1) = (T) pSrc1[indx]; // g -> 1
+                    *(pDstLine + x * step + 2) = (T) pSrc2[indx]; // b -> 2
+                }
+            }
+            if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a
+            ++indx;
+        }
+        pDstLine = (T*)(((uchar*)pDstLine) - nDstPitch);
+    }
+                      pinnedFree(pSrc0);
+    if(channels >= 3) pinnedFree(pSrc1);
+    if(channels >= 3) pinnedFree(pSrc2);
+    if(channels >= 4) pinnedFree(pSrc3);
+
+    if(rr != 0) AF_CHECK(af_release_array(rr ));
+    if(gg != 0) AF_CHECK(af_release_array(gg ));
+    if(bb != 0) AF_CHECK(af_release_array(bb ));
+    if(aa != 0) AF_CHECK(af_release_array(aa ));
+    if(rrT!= 0) AF_CHECK(af_release_array(rrT));
+    if(ggT!= 0) AF_CHECK(af_release_array(ggT));
+    if(bbT!= 0) AF_CHECK(af_release_array(bbT));
+    if(aaT!= 0) AF_CHECK(af_release_array(aaT));
+}
+
+// Save an image to disk.
+af_err af_save_image_native(const char* filename, const af_array in)
+{
+    try {
+
+        ARG_ASSERT(0, filename != NULL);
+
+        FI_Init();
+
+        // set your own FreeImage error handler
+        FreeImage_SetOutputMessage(FreeImageErrorHandler);
+
+        // try to guess the file format from the file extension
+        FREE_IMAGE_FORMAT fif = FreeImage_GetFileType(filename);
+        if (fif == FIF_UNKNOWN) {
+            fif = FreeImage_GetFIFFromFilename(filename);
+        }
+
+        if(fif == FIF_UNKNOWN) {
+            AF_ERROR("FreeImage Error: Unknown Filetype", AF_ERR_NOT_SUPPORTED);
+        }
+
+        ArrayInfo info = getInfo(in);
+        // check image color type
+        FI_CHANNELS channels = (FI_CHANNELS)info.dims()[2];
+        DIM_ASSERT(1, channels <= 4);
+        DIM_ASSERT(1, channels != 2);
+
+        // sizes
+        uint fi_w = info.dims()[1];
+        uint fi_h = info.dims()[0];
+
+        af_dtype type = info.getType();
+
+        // FI assumes [0-255] for u8
+        // FI assumes [0-65k] for u16
+        // FI assumes [0-1]   for f32
+        int fi_bpp = 0;
+        switch(type) {
+            case u8:  fi_bpp = channels * 8; break;
+            case u16: fi_bpp = channels * 16; break;
+            case f32: fi_bpp = channels * 32; break;
+            default: TYPE_ERROR(1, type);
+        }
+
+        FREE_IMAGE_TYPE fit_type = getFIT(channels, type);
+
+        // create the result image storage using FreeImage
+        FIBITMAP* pResultBitmap = NULL;
+        switch(type) {
+            case u8:  pResultBitmap = FreeImage_AllocateT(fit_type, fi_w, fi_h, fi_bpp); break;
+            case u16: pResultBitmap = FreeImage_AllocateT(fit_type, fi_w, fi_h, fi_bpp); break;
+            case f32: pResultBitmap = FreeImage_AllocateT(fit_type, fi_w, fi_h, fi_bpp); break;
+            default: TYPE_ERROR(1, type);
+        }
+
+        if(pResultBitmap == NULL) {
+            AF_ERROR("FreeImage Error: Error creating image or file", AF_ERR_RUNTIME);
+        }
+
+        // make sure pResultBitmap is unloaded automatically, no matter how we exit this function
+        FI_BitmapResource resultBitmapUnloader(pResultBitmap);
+
+        // FI = row major | AF = column major
+        uint nDstPitch = FreeImage_GetPitch(pResultBitmap);
+        void* pDstLine = FreeImage_GetBits(pResultBitmap) + nDstPitch * (fi_h - 1);
+
+        if(channels == AFFI_GRAY) {
+            switch(type) {
+                case u8:  save_t<uchar , AFFI_GRAY>((uchar *)pDstLine, in, info.dims(), nDstPitch); break;
+                case u16: save_t<ushort, AFFI_GRAY>((ushort*)pDstLine, in, info.dims(), nDstPitch); break;
+                case f32: save_t<float , AFFI_GRAY>((float *)pDstLine, in, info.dims(), nDstPitch); break;
+                default: TYPE_ERROR(1, type);
+            }
+        } else if(channels == AFFI_RGB) {
+            switch(type) {
+                case u8:  save_t<uchar , AFFI_RGB >((uchar *)pDstLine, in, info.dims(), nDstPitch); break;
+                case u16: save_t<ushort, AFFI_RGB >((ushort*)pDstLine, in, info.dims(), nDstPitch); break;
+                case f32: save_t<float , AFFI_RGB >((float *)pDstLine, in, info.dims(), nDstPitch); break;
+                default: TYPE_ERROR(1, type);
+            }
+        } else {
+            switch(type) {
+                case u8:  save_t<uchar , AFFI_RGBA>((uchar *)pDstLine, in, info.dims(), nDstPitch); break;
+                case u16: save_t<ushort, AFFI_RGBA>((ushort*)pDstLine, in, info.dims(), nDstPitch); break;
+                case f32: save_t<float , AFFI_RGBA>((float *)pDstLine, in, info.dims(), nDstPitch); break;
+                default: TYPE_ERROR(1, type);
+            }
+        }
+
+        int flags = 0;
+        if(fif == FIF_JPEG) flags = flags | JPEG_QUALITYSUPERB;
+
+        // now save the result image
+        if (!(FreeImage_Save(fif, pResultBitmap, filename, flags) == TRUE)) {
+            AF_ERROR("FreeImage Error: Failed to save image", AF_ERR_RUNTIME);
+        }
+
+    } CATCHALL
+
+    return AF_SUCCESS;
+}
+
+#else   // WITH_FREEIMAGE
+#include <af/image.h>
+#include <stdio.h>
+af_err af_load_image_native(af_array *out, const char* filename)
+{
+    printf("Error: Image IO requires FreeImage. See https://github.com/arrayfire/arrayfire\n");
+    return AF_ERR_NOT_CONFIGURED;
+}
+
+af_err af_save_image_native(const char* filename, const af_array in)
+{
+    printf("Error: Image IO requires FreeImage. See https://github.com/arrayfire/arrayfire\n");
+    return AF_ERR_NOT_CONFIGURED;
+}
+#endif  // WITH_FREEIMAGE
diff --git a/src/api/c/imageio_helper.h b/src/api/c/imageio_helper.h
new file mode 100644
index 0000000000..a37973f006
--- /dev/null
+++ b/src/api/c/imageio_helper.h
@@ -0,0 +1,102 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef IMAGEIO_HELPER_H
+#define IMAGEIO_HELPER_H
+
+#include <FreeImage.h>
+
+#include <af/array.h>
+#include <af/index.h>
+#include <af/dim4.hpp>
+#include <err_common.hpp>
+
+class FI_Manager
+{
+    public:
+    static bool initialized;
+    FI_Manager()
+    {
+#ifdef FREEIMAGE_LIB
+        FreeImage_Initialise();
+#endif
+        initialized = true;
+    }
+
+    ~FI_Manager()
+    {
+#ifdef FREEIMAGE_LIB
+        FreeImage_DeInitialise();
+#endif
+    }
+};
+
+static void FI_Init()
+{
+    static FI_Manager manager = FI_Manager();
+}
+
+class FI_BitmapResource
+{
+public:
+    explicit FI_BitmapResource(FIBITMAP * p) :
+        pBitmap(p)
+    {
+    }
+
+    ~FI_BitmapResource()
+    {
+        FreeImage_Unload(pBitmap);
+    }
+private:
+    FIBITMAP * pBitmap;
+};
+
+typedef enum {
+    AFFI_GRAY = 1,
+    AFFI_RGB  = 3,
+    AFFI_RGBA = 4
+} FI_CHANNELS;
+
+// Error handler for FreeImage library.
+// In case this handler is invoked, it throws an af exception.
+static void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char* zMessage)
+{
+    printf("FreeImage Error Handler: %s\n", zMessage);
+}
+
+//  Split a MxNx3 image into 3 separate channel matrices.
+//  Produce 3 channels if needed
+static af_err channel_split(const af_array rgb, const af::dim4 &dims,
+                            af_array *outr, af_array *outg, af_array *outb, af_array *outa)
+{
+    try {
+        af_seq idx[4][3] = {{af_span, af_span, {0, 0, 1}},
+                            {af_span, af_span, {1, 1, 1}},
+                            {af_span, af_span, {2, 2, 1}},
+                            {af_span, af_span, {3, 3, 1}}
+                           };
+
+        if (dims[2] == 4) {
+            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
+            AF_CHECK(af_index(outg, rgb, dims.ndims(), idx[1]));
+            AF_CHECK(af_index(outb, rgb, dims.ndims(), idx[2]));
+            AF_CHECK(af_index(outa, rgb, dims.ndims(), idx[3]));
+        } else if (dims[2] == 3) {
+            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
+            AF_CHECK(af_index(outg, rgb, dims.ndims(), idx[1]));
+            AF_CHECK(af_index(outb, rgb, dims.ndims(), idx[2]));
+        } else {
+            AF_CHECK(af_index(outr, rgb, dims.ndims(), idx[0]));
+        }
+    } CATCHALL;
+    return AF_SUCCESS;
+}
+
+#endif
diff --git a/src/api/c/implicit.cpp b/src/api/c/implicit.cpp
index b7a661d67c..372fb9654e 100644
--- a/src/api/c/implicit.cpp
+++ b/src/api/c/implicit.cpp
@@ -47,6 +47,12 @@ af_dtype implicit(const af_dtype lty, const af_dtype rty)
     if ((lty == s32) ||
         (rty == s32)) return s32;
 
+    if ((lty == u16) ||
+        (rty == u16)) return u16;
+
+    if ((lty == s16) ||
+        (rty == s16)) return s16;
+
     if ((lty == u8 ) ||
         (rty == u8 )) return u8;
 
diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp
index a9f276d8aa..b6eb8ab4cd 100644
--- a/src/api/c/index.cpp
+++ b/src/api/c/index.cpp
@@ -60,6 +60,8 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const
         case b8:     indexArray<char>    (out, in, ndims, index);  break;
         case s32:    indexArray<int>     (out, in, ndims, index);  break;
         case u32:    indexArray<unsigned>(out, in, ndims, index);  break;
+        case s16:    indexArray<short>   (out, in, ndims, index);  break;
+        case u16:    indexArray<ushort>  (out, in, ndims, index);  break;
         case s64:    indexArray<intl>    (out, in, ndims, index);  break;
         case u64:    indexArray<uintl>   (out, in, ndims, index);  break;
         case u8:     indexArray<uchar>   (out, in, ndims, index);  break;
@@ -88,6 +90,8 @@ static af_array lookup(const af_array &in, const af_array &idx, const unsigned d
         case u32: return getHandle(lookup<unsigned, idx_t > (getArray<unsigned>(in), getArray<idx_t>(idx), dim));
         case s64: return getHandle(lookup<intl    , idx_t > (getArray<intl    >(in), getArray<idx_t>(idx), dim));
         case u64: return getHandle(lookup<uintl   , idx_t > (getArray<uintl   >(in), getArray<idx_t>(idx), dim));
+        case s16: return getHandle(lookup<short   , idx_t > (getArray<short   >(in), getArray<idx_t>(idx), dim));
+        case u16: return getHandle(lookup<ushort  , idx_t > (getArray<ushort  >(in), getArray<idx_t>(idx), dim));
         case  u8: return getHandle(lookup<uchar   , idx_t > (getArray<uchar   >(in), getArray<idx_t>(idx), dim));
         case  b8: return getHandle(lookup<char    , idx_t > (getArray<char    >(in), getArray<idx_t>(idx), dim));
         default : TYPE_ERROR(1, inType);
@@ -116,6 +120,10 @@ af_err af_lookup(af_array *out, const af_array in, const af_array indices, const
             case f64: output = lookup<double  >(in, indices, dim); break;
             case s32: output = lookup<int     >(in, indices, dim); break;
             case u32: output = lookup<unsigned>(in, indices, dim); break;
+            case s16: output = lookup<short   >(in, indices, dim); break;
+            case u16: output = lookup<ushort  >(in, indices, dim); break;
+            case s64: output = lookup<intl    >(in, indices, dim); break;
+            case u64: output = lookup<uintl   >(in, indices, dim); break;
             case  u8: output = lookup<uchar   >(in, indices, dim); break;
             default : TYPE_ERROR(1, idxType);
         }
@@ -127,12 +135,6 @@ af_err af_lookup(af_array *out, const af_array in, const af_array indices, const
     return AF_SUCCESS;
 }
 
-af_seq
-af_make_seq(double begin, double end, double step) {
-    af_seq seq = {begin, end, step};
-    return seq;
-}
-
 // idxrs parameter to the below static function
 // expects 4 values which is handled appropriately
 // by the C-API af_index_gen
@@ -214,9 +216,11 @@ af_err af_index_gen(af_array *out, const af_array in, const dim_t ndims, const a
             case c32: output = genIndex<cfloat >(in, idxrs); break;
             case f32: output = genIndex<float  >(in, idxrs); break;
             case u64: output = genIndex<uintl  >(in, idxrs); break;
-            case u32: output = genIndex<uint   >(in, idxrs); break;
             case s64: output = genIndex<intl   >(in, idxrs); break;
+            case u32: output = genIndex<uint   >(in, idxrs); break;
             case s32: output = genIndex<int    >(in, idxrs); break;
+            case u16: output = genIndex<ushort >(in, idxrs); break;
+            case s16: output = genIndex<short  >(in, idxrs); break;
             case  u8: output = genIndex<uchar  >(in, idxrs); break;
             case  b8: output = genIndex<char   >(in, idxrs); break;
             default: TYPE_ERROR(1, inType);
diff --git a/src/api/c/join.cpp b/src/api/c/join.cpp
index 67035f3181..2a2b93dd36 100644
--- a/src/api/c/join.cpp
+++ b/src/api/c/join.cpp
@@ -67,6 +67,8 @@ af_err af_join(af_array *out, const int dim, const af_array first, const af_arra
             case u32: output = join<uint   , uint   >(dim, first, second);  break;
             case s64: output = join<intl   , intl   >(dim, first, second);  break;
             case u64: output = join<uintl  , uintl  >(dim, first, second);  break;
+            case s16: output = join<short  , short  >(dim, first, second);  break;
+            case u16: output = join<ushort , ushort >(dim, first, second);  break;
             case u8:  output = join<uchar  , uchar  >(dim, first, second);  break;
             default:  TYPE_ERROR(1, finfo.getType());
         }
@@ -119,6 +121,8 @@ af_err af_join_many(af_array *out, const int dim, const unsigned n_arrays, const
             case u32: output = join_many<uint   >(dim, n_arrays, inputs);  break;
             case s64: output = join_many<intl   >(dim, n_arrays, inputs);  break;
             case u64: output = join_many<uintl  >(dim, n_arrays, inputs);  break;
+            case s16: output = join_many<short  >(dim, n_arrays, inputs);  break;
+            case u16: output = join_many<ushort >(dim, n_arrays, inputs);  break;
             case u8:  output = join_many<uchar  >(dim, n_arrays, inputs);  break;
             default:  TYPE_ERROR(1, info[0].getType());
         }
diff --git a/src/api/c/match_template.cpp b/src/api/c/match_template.cpp
index 4e755e2504..0e618c2bc4 100644
--- a/src/api/c/match_template.cpp
+++ b/src/api/c/match_template.cpp
@@ -60,6 +60,8 @@ af_err af_match_template(af_array *out, const af_array search_img, const af_arra
             case f32: output = match_template<float ,  float>(search_img, template_img, m_type); break;
             case s32: output = match_template<int   ,  float>(search_img, template_img, m_type); break;
             case u32: output = match_template<uint  ,  float>(search_img, template_img, m_type); break;
+            case s16: output = match_template<short ,  float>(search_img, template_img, m_type); break;
+            case u16: output = match_template<ushort,  float>(search_img, template_img, m_type); break;
             case  b8: output = match_template<char  ,  float>(search_img, template_img, m_type); break;
             case  u8: output = match_template<uchar ,  float>(search_img, template_img, m_type); break;
             default : TYPE_ERROR(1, sType);
diff --git a/src/api/c/mean.cpp b/src/api/c/mean.cpp
index 1f71a85a41..1cbee32ec6 100644
--- a/src/api/c/mean.cpp
+++ b/src/api/c/mean.cpp
@@ -22,46 +22,33 @@
 
 using namespace detail;
 
-template<typename inType, typename outType>
-static outType mean(const af_array &in)
+template<typename Ti, typename To>
+static To mean(const af_array &in)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
-    outType result = mean<outType>(input); /* defined in stats.h */
-    return result;
+    /* following function is defined in stats.h */
+    return mean<Ti, To>(getArray<Ti>(in)); /* defined in stats.h */
 }
 
-template<typename inType, typename outType>
-static outType mean(const af_array &in, const af_array &weights)
+template<typename T>
+static T mean(const af_array &in, const af_array &weights)
 {
-    typedef typename baseOutType<outType>::type bType;
-
-    Array<outType> input = cast<outType>(getArray<inType>(in));
-    Array<outType> wts   = cast<outType>(getArray<bType>(weights));
-
-    outType result = mean<outType, bType>(input, getArray<bType>(weights)); /* defined in stats.h */
-
-    return result;
+    typedef typename baseOutType<T>::type Tw;
+    /* following function is defined in stats.h */
+    return mean<T, Tw>(castArray<T>(in), castArray<Tw>(weights));
 }
 
-template<typename inType, typename outType>
+template<typename Ti, typename To>
 static af_array mean(const af_array &in, const dim_t dim)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
-    Array<outType>  result= mean<outType>(input, dim); /* defined in stats.h */
-
-    return getHandle<outType>(result);
+    /* following function is defined in stats.h */
+    return getHandle<To>(mean<Ti, To>(getArray<Ti>(in), dim));
 }
 
-template<typename inType, typename outType>
+template<typename T>
 static af_array mean(const af_array &in, const af_array &weights, const dim_t dim)
 {
-    typedef typename baseOutType<outType>::type bType;
-
-    Array<outType> input = cast<outType>(getArray<inType>(in));
-    Array<outType> wts   = cast<outType>(getArray<bType>(weights));
-    Array<outType> retVal= mean<outType>(input, wts, dim); /* defined in stats.h */
-
-    return getHandle<outType>(retVal);
+    /* following function is defined in stats.h */
+    return getHandle<T>(mean<T>(castArray<T>(in), castArray<T>(weights), dim));
 }
 
 af_err af_mean(af_array *out, const af_array in, const dim_t dim)
@@ -73,16 +60,18 @@ af_err af_mean(af_array *out, const af_array in, const dim_t dim)
         ArrayInfo info = getInfo(in);
         af_dtype type = info.getType();
         switch(type) {
-            case f64: output = mean<double,  double>(in, dim); break;
-            case f32: output = mean<float ,  float >(in, dim); break;
-            case s32: output = mean<int   ,  float >(in, dim); break;
-            case u32: output = mean<uint  ,  float >(in, dim); break;
-            case s64: output = mean<intl  ,  double>(in, dim); break;
-            case u64: output = mean<uintl ,  double>(in, dim); break;
-            case  u8: output = mean<uchar ,  float >(in, dim); break;
-            case  b8: output = mean<char  ,  float >(in, dim); break;
-            case c32: output = mean<cfloat,  cfloat>(in, dim); break;
-            case c64: output = mean<cdouble,cdouble>(in, dim); break;
+            case f64: output = mean<double  ,  double>(in, dim); break;
+            case f32: output = mean<float   ,  float >(in, dim); break;
+            case s32: output = mean<int     ,  float >(in, dim); break;
+            case u32: output = mean<unsigned,  float >(in, dim); break;
+            case s64: output = mean<intl    ,  double>(in, dim); break;
+            case u64: output = mean<uintl   ,  double>(in, dim); break;
+            case s16: output = mean<short   ,  float >(in, dim); break;
+            case u16: output = mean<ushort  ,  float >(in, dim); break;
+            case  u8: output = mean<uchar   ,  float >(in, dim); break;
+            case  b8: output = mean<char    ,  float >(in, dim); break;
+            case c32: output = mean<cfloat  ,  cfloat>(in, dim); break;
+            case c64: output = mean<cdouble , cdouble>(in, dim); break;
             default : TYPE_ERROR(1, type);
         }
         std::swap(*out, output);
@@ -105,16 +94,18 @@ af_err af_mean_weighted(af_array *out, const af_array in, const af_array weights
         ARG_ASSERT(2, (wType==f32 || wType==f64)); /* verify that weights are non-complex real numbers */
 
         switch(iType) {
-            case f64: output = mean<double,  double>(in, weights, dim); break;
-            case f32: output = mean<float ,  float >(in, weights, dim); break;
-            case s32: output = mean<int   ,  float >(in, weights, dim); break;
-            case u32: output = mean<uint  ,  float >(in, weights, dim); break;
-            case s64: output = mean<intl  ,  double>(in, weights, dim); break;
-            case u64: output = mean<uintl ,  double>(in, weights, dim); break;
-            case  u8: output = mean<uchar ,  float >(in, weights, dim); break;
-            case  b8: output = mean<char  ,  float >(in, weights, dim); break;
-            case c32: output = mean<cfloat,  cfloat>(in, weights, dim); break;
-            case c64: output = mean<cdouble,cdouble>(in, weights, dim); break;
+            case f64: output = mean< double>(in, weights, dim); break;
+            case f32: output = mean< float >(in, weights, dim); break;
+            case s32: output = mean< float >(in, weights, dim); break;
+            case u32: output = mean< float >(in, weights, dim); break;
+            case s64: output = mean< double>(in, weights, dim); break;
+            case u64: output = mean< double>(in, weights, dim); break;
+            case s16: output = mean< float >(in, weights, dim); break;
+            case u16: output = mean< float >(in, weights, dim); break;
+            case  u8: output = mean< float >(in, weights, dim); break;
+            case  b8: output = mean< float >(in, weights, dim); break;
+            case c32: output = mean< cfloat>(in, weights, dim); break;
+            case c64: output = mean<cdouble>(in, weights, dim); break;
             default : TYPE_ERROR(1, iType);
         }
         std::swap(*out, output);
@@ -129,21 +120,23 @@ af_err af_mean_all(double *realVal, double *imagVal, const af_array in)
         ArrayInfo info = getInfo(in);
         af_dtype type = info.getType();
         switch(type) {
-            case f64: *realVal = mean<double, double>(in); break;
-            case f32: *realVal = mean<float ,  float>(in); break;
-            case s32: *realVal = mean<int   ,  float>(in); break;
-            case u32: *realVal = mean<uint  ,  float>(in); break;
-            case s64: *realVal = mean<intl  , double>(in); break;
-            case u64: *realVal = mean<uintl , double>(in); break;
-            case  u8: *realVal = mean<uchar ,  float>(in); break;
-            case  b8: *realVal = mean<char  ,  float>(in); break;
+            case f64: *realVal = mean<double  , double>(in); break;
+            case f32: *realVal = mean<float   , float >(in); break;
+            case s32: *realVal = mean<int     , float >(in); break;
+            case u32: *realVal = mean<unsigned, float >(in); break;
+            case s64: *realVal = mean<intl    , double>(in); break;
+            case u64: *realVal = mean<uintl   , double>(in); break;
+            case s16: *realVal = mean<short   , float >(in); break;
+            case u16: *realVal = mean<ushort  , float >(in); break;
+            case  u8: *realVal = mean<uchar   , float >(in); break;
+            case  b8: *realVal = mean<char    , float >(in); break;
             case c32: {
-                cfloat tmp = mean<cfloat,cfloat>(in);
+                cfloat tmp = mean<cfloat, cfloat>(in);
                 *realVal = real(tmp);
                 *imagVal = imag(tmp);
                 } break;
             case c64: {
-                cdouble tmp = mean<cdouble,cdouble>(in);
+                cdouble tmp = mean<cdouble, cdouble>(in);
                 *realVal = real(tmp);
                 *imagVal = imag(tmp);
                 } break;
@@ -165,21 +158,23 @@ af_err af_mean_all_weighted(double *realVal, double *imagVal, const af_array in,
         ARG_ASSERT(3, (wType==f32 || wType==f64)); /* verify that weights are non-complex real numbers */
 
         switch(iType) {
-            case f64: *realVal = mean<double, double>(in, weights); break;
-            case f32: *realVal = mean<float ,  float>(in, weights); break;
-            case s32: *realVal = mean<int   ,  float>(in, weights); break;
-            case u32: *realVal = mean<uint  ,  float>(in, weights); break;
-            case s64: *realVal = mean<intl  , double>(in, weights); break;
-            case u64: *realVal = mean<uintl , double>(in, weights); break;
-            case  u8: *realVal = mean<uchar ,  float>(in, weights); break;
-            case  b8: *realVal = mean<char  ,  float>(in, weights); break;
+            case f64: *realVal = mean<double>(in, weights); break;
+            case f32: *realVal = mean< float>(in, weights); break;
+            case s32: *realVal = mean< float>(in, weights); break;
+            case u32: *realVal = mean< float>(in, weights); break;
+            case s64: *realVal = mean<double>(in, weights); break;
+            case u64: *realVal = mean<double>(in, weights); break;
+            case s16: *realVal = mean< float>(in, weights); break;
+            case u16: *realVal = mean< float>(in, weights); break;
+            case  u8: *realVal = mean< float>(in, weights); break;
+            case  b8: *realVal = mean< float>(in, weights); break;
             case c32: {
-                cfloat tmp = mean<cfloat,cfloat>(in);
+                cfloat tmp = mean<cfloat>(in, weights);
                 *realVal = real(tmp);
                 *imagVal = imag(tmp);
                 } break;
             case c64: {
-                cdouble tmp = mean<cdouble,cdouble>(in);
+                cdouble tmp = mean<cdouble>(in, weights);
                 *realVal = real(tmp);
                 *imagVal = imag(tmp);
                 } break;
diff --git a/src/api/c/meanshift.cpp b/src/api/c/meanshift.cpp
index 6c938548d4..eb4305a5d0 100644
--- a/src/api/c/meanshift.cpp
+++ b/src/api/c/meanshift.cpp
@@ -46,6 +46,10 @@ af_err mean_shift(af_array *out, const af_array in, const float s_sigma, const f
             case b8 : output = mean_shift<char  , is_color>(in, s_sigma, c_sigma, iter); break;
             case s32: output = mean_shift<int   , is_color>(in, s_sigma, c_sigma, iter); break;
             case u32: output = mean_shift<uint  , is_color>(in, s_sigma, c_sigma, iter); break;
+            case s16: output = mean_shift<short , is_color>(in, s_sigma, c_sigma, iter); break;
+            case u16: output = mean_shift<ushort, is_color>(in, s_sigma, c_sigma, iter); break;
+            case s64: output = mean_shift<intl  , is_color>(in, s_sigma, c_sigma, iter); break;
+            case u64: output = mean_shift<uintl , is_color>(in, s_sigma, c_sigma, iter); break;
             case u8 : output = mean_shift<uchar , is_color>(in, s_sigma, c_sigma, iter); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp
index f8a9b58242..716df78028 100644
--- a/src/api/c/median.cpp
+++ b/src/api/c/median.cpp
@@ -132,6 +132,8 @@ af_err af_median_all(double *realVal, double *imagVal, const af_array in)
             case f32: *realVal = median<float >(in); break;
             case s32: *realVal = median<int   >(in); break;
             case u32: *realVal = median<uint  >(in); break;
+            case s16: *realVal = median<short >(in); break;
+            case u16: *realVal = median<ushort>(in); break;
             case  u8: *realVal = median<uchar >(in); break;
             default : TYPE_ERROR(1, type);
         }
@@ -153,6 +155,8 @@ af_err af_median(af_array* out, const af_array in, const dim_t dim)
             case f32: output = median<float >(in, dim); break;
             case s32: output = median<int   >(in, dim); break;
             case u32: output = median<uint  >(in, dim); break;
+            case s16: output = median<short >(in, dim); break;
+            case u16: output = median<ushort>(in, dim); break;
             case  u8: output = median<uchar >(in, dim); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp
index e43efa067c..7ccc38c3cb 100644
--- a/src/api/c/moddims.cpp
+++ b/src/api/c/moddims.cpp
@@ -63,6 +63,8 @@ af_err af_moddims(af_array *out, const af_array in,
         case u8:  output = getHandle(modDims<uchar  >(getArray<uchar  >(in), newDims)); break;
         case s64: output = getHandle(modDims<intl   >(getArray<intl   >(in), newDims)); break;
         case u64: output = getHandle(modDims<uintl  >(getArray<uintl  >(in), newDims)); break;
+        case s16: output = getHandle(modDims<short  >(getArray<short  >(in), newDims)); break;
+        case u16: output = getHandle(modDims<ushort >(getArray<ushort >(in), newDims)); break;
         default: TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
diff --git a/src/api/c/morph.cpp b/src/api/c/morph.cpp
index 980097c9f4..bd9c680b26 100644
--- a/src/api/c/morph.cpp
+++ b/src/api/c/morph.cpp
@@ -58,6 +58,8 @@ static af_err morph(af_array *out, const af_array &in, const af_array &mask)
             case b8 : output = morph<char  , isDilation>(in, mask);      break;
             case s32: output = morph<int   , isDilation>(in, mask);      break;
             case u32: output = morph<uint  , isDilation>(in, mask);      break;
+            case s16: output = morph<short , isDilation>(in, mask);      break;
+            case u16: output = morph<ushort, isDilation>(in, mask);      break;
             case u8 : output = morph<uchar , isDilation>(in, mask);      break;
             default : TYPE_ERROR(1, type);
         }
@@ -90,6 +92,8 @@ static af_err morph3d(af_array *out, const af_array &in, const af_array &mask)
             case b8 : output = morph3d<char  , isDilation>(in, mask);       break;
             case s32: output = morph3d<int   , isDilation>(in, mask);       break;
             case u32: output = morph3d<uint  , isDilation>(in, mask);       break;
+            case s16: output = morph3d<short , isDilation>(in, mask);       break;
+            case u16: output = morph3d<ushort, isDilation>(in, mask);       break;
             case u8 : output = morph3d<uchar , isDilation>(in, mask);       break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/nearest_neighbour.cpp b/src/api/c/nearest_neighbour.cpp
index d47e0ae074..03064a4cb7 100644
--- a/src/api/c/nearest_neighbour.cpp
+++ b/src/api/c/nearest_neighbour.cpp
@@ -57,16 +57,17 @@ af_err af_nearest_neighbour(af_array* idx, af_array* dist,
         ARG_ASSERT(6, dist_type == AF_SAD || dist_type == AF_SSD || dist_type == AF_SHD);
         TYPE_ASSERT(qType == tType);
 
-        // For Hamming, only u8, u32 and u64 allowed.
+        // For Hamming, only u8, u16, u32 and u64 allowed.
         af_array oIdx;
         af_array oDist;
 
         if(dist_type == AF_SHD) {
-            TYPE_ASSERT(qType == u8 || qType == u32 || qType == u64);
+            TYPE_ASSERT(qType == u8 || qType == u16 || qType == u32 || qType == u64);
             switch(qType) {
-                case u8:  nearest_neighbour<uchar, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
-                case u32: nearest_neighbour<uint , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
-                case u64: nearest_neighbour<uintl, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u8:  nearest_neighbour<uchar , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, AF_SHD); break;
+                case u16: nearest_neighbour<ushort, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, AF_SHD); break;
+                case u32: nearest_neighbour<uint  , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, AF_SHD); break;
+                case u64: nearest_neighbour<uintl , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, AF_SHD); break;
                 default : TYPE_ERROR(1, qType);
             }
         } else {
@@ -77,6 +78,8 @@ af_err af_nearest_neighbour(af_array* idx, af_array* dist,
                 case u32: nearest_neighbour<uint  , uint  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
                 case s64: nearest_neighbour<intl  , intl  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
                 case u64: nearest_neighbour<uintl , uintl >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case s16: nearest_neighbour<short , int   >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u16: nearest_neighbour<ushort, uint  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
                 case u8:  nearest_neighbour<uchar , uint  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
                 default : TYPE_ERROR(1, qType);
             }
diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp
index 0723a30eff..b22e92850b 100644
--- a/src/api/c/plot.cpp
+++ b/src/api/c/plot.cpp
@@ -49,8 +49,7 @@ fg::Plot* setup_plot(const af_array X, const af_array Y)
     fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType<T>());
     plot->setColor(1.0, 0.0, 0.0);
     plot->setAxesLimits(xmax, xmin, ymax, ymin);
-    plot->setXAxisTitle("X Axis");
-    plot->setYAxisTitle("Y Axis");
+    plot->setAxesTitles("X Axis", "Y Axis");
 
     copy_plot<T>(P, plot);
 
@@ -89,6 +88,8 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co
             case f32: plot = setup_plot<float  >(X, Y); break;
             case s32: plot = setup_plot<int    >(X, Y); break;
             case u32: plot = setup_plot<uint   >(X, Y); break;
+            case s16: plot = setup_plot<short  >(X, Y); break;
+            case u16: plot = setup_plot<ushort >(X, Y); break;
             case u8 : plot = setup_plot<uchar  >(X, Y); break;
             default:  TYPE_ERROR(1, Xtype);
         }
diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp
new file mode 100644
index 0000000000..473bce0b96
--- /dev/null
+++ b/src/api/c/plot3.cpp
@@ -0,0 +1,113 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/graphics.h>
+#include <af/image.h>
+
+#include <ArrayInfo.hpp>
+#include <graphics_common.hpp>
+#include <err_common.hpp>
+#include <backend.hpp>
+#include <plot3.hpp>
+#include <reduce.hpp>
+#include <join.hpp>
+#include <transpose.hpp>
+#include <reorder.hpp>
+#include <handle.hpp>
+#include <af/data.h>
+#include <iostream>
+
+using af::dim4;
+using namespace detail;
+
+#if defined(WITH_GRAPHICS)
+using namespace graphics;
+
+template<typename T>
+fg::Plot3* setup_plot3(const af_array P)
+{
+    Array<T> pIn = getArray<T>(P);
+    ArrayInfo Pinfo = getInfo(P);
+    af::dim4 P_dims = Pinfo.dims();
+
+    DIM_ASSERT(0, Pinfo.ndims() == 1 || Pinfo.ndims() == 2);
+    DIM_ASSERT(0, (P_dims[0] == 3 || P_dims[1] == 3) ||
+                    (Pinfo.isVector() && P_dims[0]%3 == 0));
+
+    if(Pinfo.isVector()){
+        dim4 rdims(P_dims.elements()/3, 3, 1, 1);
+        pIn.modDims(rdims);
+        P_dims = pIn.dims();
+    }
+
+    T max[3], min[3];
+    if(P_dims[0] == 3) {
+        af_get_data_ptr(max, getHandle(reduce<af_max_t, T, T>(pIn, 1)));
+        af_get_data_ptr(min, getHandle(reduce<af_min_t, T, T>(pIn, 1)));
+    }
+
+    if(P_dims[1] == 3) {
+        af_get_data_ptr(max, getHandle(reduce<af_max_t, T, T>(pIn, 0)));
+        af_get_data_ptr(min, getHandle(reduce<af_min_t, T, T>(pIn, 0)));
+    }
+
+    ForgeManager& fgMngr = ForgeManager::getInstance();
+    fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType<T>());
+    plot3->setColor(1.0, 0.0, 0.0);
+    plot3->setAxesLimits(max[0], min[0],
+                         max[1], min[1],
+                         max[2], min[2]);
+    plot3->setAxesTitles("X Axis", "Y Axis", "Z Axis");
+
+    if(P_dims[1] == 3){
+        pIn = transpose(pIn, false);
+    }
+    copy_plot3<T>(pIn, plot3);
+
+    return plot3;
+}
+#endif
+
+af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    if(wind==0) {
+        std::cerr<<"Not a valid window"<<std::endl;
+        return AF_SUCCESS;
+    }
+
+    try {
+        ArrayInfo Pinfo = getInfo(P);
+        af_dtype Ptype  = Pinfo.getType();
+
+        fg::Window* window = reinterpret_cast<fg::Window*>(wind);
+        window->makeCurrent();
+        fg::Plot3* plot3 = NULL;
+
+        switch(Ptype) {
+            case f32: plot3 = setup_plot3<float >(P); break;
+            case s32: plot3 = setup_plot3<int   >(P); break;
+            case u32: plot3 = setup_plot3<uint  >(P); break;
+            case s16: plot3 = setup_plot3<short >(P); break;
+            case u16: plot3 = setup_plot3<ushort>(P); break;
+            case u8 : plot3 = setup_plot3<uchar >(P); break;
+            default:  TYPE_ERROR(1, Ptype);
+        }
+
+        if (props->col>-1 && props->row>-1)
+            window->draw(props->col, props->row, *plot3, props->title);
+        else
+            window->draw(*plot3);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+#else
+    return AF_ERR_NO_GFX;
+#endif
+}
diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp
index eb6dd05705..a5c178cf0c 100644
--- a/src/api/c/print.cpp
+++ b/src/api/c/print.cpp
@@ -111,6 +111,8 @@ af_err af_print_array(af_array arr)
         case u8:    print<uchar>   (NULL, arr, 4);   break;
         case s64:   print<intl>    (NULL, arr, 4);   break;
         case u64:   print<uintl>   (NULL, arr, 4);   break;
+        case s16:   print<short>   (NULL, arr, 4);   break;
+        case u16:   print<ushort>  (NULL, arr, 4);   break;
         default:    TYPE_ERROR(1, type);
         }
     }
@@ -136,6 +138,8 @@ af_err af_print_array_gen(const char *exp, const af_array arr, const int precisi
         case u8:    print<uchar   >(exp, arr, precision);   break;
         case s64:   print<intl    >(exp, arr, precision);   break;
         case u64:   print<uintl   >(exp, arr, precision);   break;
+        case s16:   print<short   >(exp, arr, precision);   break;
+        case u16:   print<ushort  >(exp, arr, precision);   break;
         default:    TYPE_ERROR(1, type);
         }
     }
@@ -163,6 +167,8 @@ af_err af_array_to_string(char **output, const char *exp, const af_array arr,
         case u8:    print<uchar   >(exp, arr, precision, ss, transpose);   break;
         case s64:   print<intl    >(exp, arr, precision, ss, transpose);   break;
         case u64:   print<uintl   >(exp, arr, precision, ss, transpose);   break;
+        case s16:   print<short   >(exp, arr, precision, ss, transpose);   break;
+        case u16:   print<ushort  >(exp, arr, precision, ss, transpose);   break;
         default:    TYPE_ERROR(1, type);
         }
         std::string str = ss.str();
diff --git a/src/api/c/reduce.cpp b/src/api/c/reduce.cpp
index cedf4f90cd..3fe30be9c0 100644
--- a/src/api/c/reduce.cpp
+++ b/src/api/c/reduce.cpp
@@ -56,6 +56,8 @@ static af_err reduce_type(af_array *out, const af_array in, const int dim)
         case s32:  res = reduce<op, int    , To>(in, dim); break;
         case u64:  res = reduce<op, uintl  , To>(in, dim); break;
         case s64:  res = reduce<op, intl   , To>(in, dim); break;
+        case u16:  res = reduce<op, ushort , To>(in, dim); break;
+        case s16:  res = reduce<op, short  , To>(in, dim); break;
         case b8:   res = reduce<op, char   , To>(in, dim); break;
         case u8:   res = reduce<op, uchar  , To>(in, dim); break;
         default:   TYPE_ERROR(1, type);
@@ -95,6 +97,8 @@ static af_err reduce_common(af_array *out, const af_array in, const int dim)
         case s32:  res = reduce<op, int    , int    >(in, dim); break;
         case u64:  res = reduce<op, uintl  , uintl  >(in, dim); break;
         case s64:  res = reduce<op, intl   , intl   >(in, dim); break;
+        case u16:  res = reduce<op, ushort , ushort >(in, dim); break;
+        case s16:  res = reduce<op, short  , short  >(in, dim); break;
         case b8:   res = reduce<op, char   , char   >(in, dim); break;
         case u8:   res = reduce<op, uchar  , uchar  >(in, dim); break;
         default:   TYPE_ERROR(1, type);
@@ -135,6 +139,8 @@ static af_err reduce_promote(af_array *out, const af_array in, const int dim,
         case s32:  res = reduce<op, int    , int    >(in, dim, change_nan, nanval); break;
         case u64:  res = reduce<op, uintl  , uintl  >(in, dim, change_nan, nanval); break;
         case s64:  res = reduce<op, intl   , intl   >(in, dim, change_nan, nanval); break;
+        case u16:  res = reduce<op, ushort , uint   >(in, dim, change_nan, nanval); break;
+        case s16:  res = reduce<op, short  , int    >(in, dim, change_nan, nanval); break;
         case u8:   res = reduce<op, uchar  , uint   >(in, dim, change_nan, nanval); break;
             // Make sure you are adding only "1" for every non zero value, even if op == af_add_t
         case b8:   res = reduce<af_notzero_t, char  , uint   >(in, dim, change_nan, nanval); break;
@@ -219,6 +225,8 @@ static af_err reduce_all_type(double *real, double *imag, const af_array in)
         case s32:  *real = (double)reduce_all<op, int    , To>(in); break;
         case u64:  *real = (double)reduce_all<op, uintl  , To>(in); break;
         case s64:  *real = (double)reduce_all<op, intl   , To>(in); break;
+        case u16:  *real = (double)reduce_all<op, ushort , To>(in); break;
+        case s16:  *real = (double)reduce_all<op, short  , To>(in); break;
         case b8:   *real = (double)reduce_all<op, char   , To>(in); break;
         case u8:   *real = (double)reduce_all<op, uchar  , To>(in); break;
         default:   TYPE_ERROR(1, type);
@@ -252,6 +260,8 @@ static af_err reduce_all_common(double *real_val, double *imag_val, const af_arr
         case s32:  *real_val = (double)reduce_all<op, int    , int    >(in); break;
         case u64:  *real_val = (double)reduce_all<op, uintl  , uintl  >(in); break;
         case s64:  *real_val = (double)reduce_all<op, intl   , intl   >(in); break;
+        case u16:  *real_val = (double)reduce_all<op, ushort , ushort >(in); break;
+        case s16:  *real_val = (double)reduce_all<op, short  , short  >(in); break;
         case b8:   *real_val = (double)reduce_all<op, char   , char   >(in); break;
         case u8:   *real_val = (double)reduce_all<op, uchar  , uchar  >(in); break;
 
@@ -301,6 +311,8 @@ static af_err reduce_all_promote(double *real_val, double *imag_val, const af_ar
         case s32: *real_val = (double)reduce_all<op, int    , int    >(in, change_nan, nanval); break;
         case u64: *real_val = (double)reduce_all<op, uintl  , uintl  >(in, change_nan, nanval); break;
         case s64: *real_val = (double)reduce_all<op, intl   , intl   >(in, change_nan, nanval); break;
+        case u16: *real_val = (double)reduce_all<op, ushort , uint   >(in, change_nan, nanval); break;
+        case s16: *real_val = (double)reduce_all<op, short  , int    >(in, change_nan, nanval); break;
         case u8:  *real_val = (double)reduce_all<op, uchar  , uint   >(in, change_nan, nanval); break;
             // Make sure you are adding only "1" for every non zero value, even if op == af_add_t
         case b8:  *real_val = (double)reduce_all<af_notzero_t, char, uint>(in, change_nan, nanval); break;
@@ -405,6 +417,8 @@ static af_err ireduce_common(af_array *val, af_array *idx, const af_array in, co
         case s32:  ireduce<op, int    >(&res, &loc, in, dim); break;
         case u64:  ireduce<op, uintl  >(&res, &loc, in, dim); break;
         case s64:  ireduce<op, intl   >(&res, &loc, in, dim); break;
+        case u16:  ireduce<op, ushort >(&res, &loc, in, dim); break;
+        case s16:  ireduce<op, short  >(&res, &loc, in, dim); break;
         case b8:   ireduce<op, char   >(&res, &loc, in, dim); break;
         case u8:   ireduce<op, uchar  >(&res, &loc, in, dim); break;
         default:   TYPE_ERROR(1, type);
@@ -457,6 +471,8 @@ static af_err ireduce_all_common(double *real_val, double *imag_val,
         case s32:  *real_val = (double)ireduce_all<op, int   >(loc, in); break;
         case u64:  *real_val = (double)ireduce_all<op, uintl >(loc, in); break;
         case s64:  *real_val = (double)ireduce_all<op, intl  >(loc, in); break;
+        case u16:  *real_val = (double)ireduce_all<op, ushort>(loc, in); break;
+        case s16:  *real_val = (double)ireduce_all<op, short >(loc, in); break;
         case b8:   *real_val = (double)ireduce_all<op, char  >(loc, in); break;
         case u8:   *real_val = (double)ireduce_all<op, uchar >(loc, in); break;
 
diff --git a/src/api/c/regions.cpp b/src/api/c/regions.cpp
index 4245eac8d5..49ddedf88c 100644
--- a/src/api/c/regions.cpp
+++ b/src/api/c/regions.cpp
@@ -46,6 +46,8 @@ af_err af_regions(af_array *out, const af_array in, const af_connectivity connec
             case f64: output = regions<double>(in, connectivity); break;
             case s32: output = regions<int   >(in, connectivity); break;
             case u32: output = regions<uint  >(in, connectivity); break;
+            case s16: output = regions<short >(in, connectivity); break;
+            case u16: output = regions<ushort>(in, connectivity); break;
             default : TYPE_ERROR(0, type);
         }
         std::swap(*out, output);
diff --git a/src/api/c/reorder.cpp b/src/api/c/reorder.cpp
index 733981cad8..10d2cc31d1 100644
--- a/src/api/c/reorder.cpp
+++ b/src/api/c/reorder.cpp
@@ -71,6 +71,8 @@ af_err af_reorder(af_array *out, const af_array in, const af::dim4 &rdims)
             case u8:  output = reorder<uchar  >(in, rdims);  break;
             case s64: output = reorder<intl   >(in, rdims);  break;
             case u64: output = reorder<uintl  >(in, rdims);  break;
+            case s16: output = reorder<short  >(in, rdims);  break;
+            case u16: output = reorder<ushort >(in, rdims);  break;
             default:  TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
diff --git a/src/api/c/replace.cpp b/src/api/c/replace.cpp
index 1f37988e28..7c0a3cf863 100644
--- a/src/api/c/replace.cpp
+++ b/src/api/c/replace.cpp
@@ -59,6 +59,8 @@ af_err af_replace(af_array a, const af_array cond, const af_array b)
         case u32: replace<uint   >(a, cond, b); break;
         case s64: replace<intl   >(a, cond, b); break;
         case u64: replace<uintl  >(a, cond, b); break;
+        case s16: replace<short  >(a, cond, b); break;
+        case u16: replace<ushort >(a, cond, b); break;
         case u8:  replace<uchar  >(a, cond, b); break;
         case b8:  replace<char   >(a, cond, b); break;
         default:  TYPE_ERROR(2, ainfo.getType());
@@ -99,6 +101,8 @@ af_err af_replace_scalar(af_array a, const af_array cond, const double b)
         case u32: replace_scalar<uint   >(a, cond, b); break;
         case s64: replace_scalar<intl   >(a, cond, b); break;
         case u64: replace_scalar<uintl  >(a, cond, b); break;
+        case s16: replace_scalar<short  >(a, cond, b); break;
+        case u16: replace_scalar<ushort >(a, cond, b); break;
         case u8:  replace_scalar<uchar  >(a, cond, b); break;
         case b8:  replace_scalar<char   >(a, cond, b); break;
         default:  TYPE_ERROR(2, ainfo.getType());
diff --git a/src/api/c/resize.cpp b/src/api/c/resize.cpp
index 419af850b0..d17bd291f5 100644
--- a/src/api/c/resize.cpp
+++ b/src/api/c/resize.cpp
@@ -50,6 +50,8 @@ af_err af_resize(af_array *out, const af_array in, const dim_t odim0, const dim_
             case u32: output = resize<uint   >(in, odim0, odim1, method);  break;
             case s64: output = resize<intl   >(in, odim0, odim1, method);  break;
             case u64: output = resize<uintl  >(in, odim0, odim1, method);  break;
+            case s16: output = resize<short  >(in, odim0, odim1, method);  break;
+            case u16: output = resize<ushort >(in, odim0, odim1, method);  break;
             case u8:  output = resize<uchar  >(in, odim0, odim1, method);  break;
             case b8:  output = resize<char   >(in, odim0, odim1, method);  break;
             default:  TYPE_ERROR(1, type);
diff --git a/src/api/c/rgb_gray.cpp b/src/api/c/rgb_gray.cpp
index 0ed5eb9583..1e52ae0899 100644
--- a/src/api/c/rgb_gray.cpp
+++ b/src/api/c/rgb_gray.cpp
@@ -122,6 +122,8 @@ af_err convert(af_array* out, const af_array in, const float r, const float g, c
             case f32: output = convert<float , float , isRGB2GRAY>(in, r, g, b); break;
             case u32: output = convert<uint  , float , isRGB2GRAY>(in, r, g, b); break;
             case s32: output = convert<int   , float , isRGB2GRAY>(in, r, g, b); break;
+            case u16: output = convert<ushort, float , isRGB2GRAY>(in, r, g, b); break;
+            case s16: output = convert<short , float , isRGB2GRAY>(in, r, g, b); break;
             case u8:  output = convert<uchar , float , isRGB2GRAY>(in, r, g, b); break;
             default: TYPE_ERROR(1, iType); break;
         }
diff --git a/src/api/c/rotate.cpp b/src/api/c/rotate.cpp
index b792239634..a5978e3e61 100644
--- a/src/api/c/rotate.cpp
+++ b/src/api/c/rotate.cpp
@@ -63,6 +63,8 @@ af_err af_rotate(af_array *out, const af_array in, const float theta,
             case u32: output = rotate<uint   >(in, theta, odims, method);  break;
             case s64: output = rotate<intl   >(in, theta, odims, method);  break;
             case u64: output = rotate<uintl  >(in, theta, odims, method);  break;
+            case s16: output = rotate<short  >(in, theta, odims, method);  break;
+            case u16: output = rotate<ushort >(in, theta, odims, method);  break;
             case u8:  output = rotate<uchar  >(in, theta, odims, method);  break;
             case b8:  output = rotate<uchar  >(in, theta, odims, method);  break;
             default:  TYPE_ERROR(1, itype);
diff --git a/src/api/c/sat.cpp b/src/api/c/sat.cpp
index 65a44815b9..fa6d0a4c23 100644
--- a/src/api/c/sat.cpp
+++ b/src/api/c/sat.cpp
@@ -47,6 +47,8 @@ af_err af_sat(af_array* out, const af_array in)
             case  u8: output = sat<uint  , uchar >(in); break;
             case s64: output = sat<intl  , intl  >(in); break;
             case u64: output = sat<uintl , uintl >(in); break;
+            case s16: output = sat<int   , short >(in); break;
+            case u16: output = sat<uint  , ushort>(in); break;
             default: TYPE_ERROR(1, inputType);
         }
         std::swap(*out, output);
diff --git a/src/api/c/scan.cpp b/src/api/c/scan.cpp
index d0c9e8e6df..321324be83 100644
--- a/src/api/c/scan.cpp
+++ b/src/api/c/scan.cpp
@@ -53,6 +53,8 @@ af_err af_accum(af_array *out, const af_array in, const int dim)
         case s32:  res = scan<af_add_t, int    , int    >(in, dim); break;
         case u64:  res = scan<af_add_t, uintl  , uintl  >(in, dim); break;
         case s64:  res = scan<af_add_t, intl   , intl   >(in, dim); break;
+        case u16:  res = scan<af_add_t, ushort , uint   >(in, dim); break;
+        case s16:  res = scan<af_add_t, short  , int    >(in, dim); break;
         case u8:   res = scan<af_add_t, uchar  , uint   >(in, dim); break;
         // Make sure you are adding only "1" for every non zero value, even if op == af_add_t
         case b8:   res = scan<af_notzero_t, char  , uint   >(in, dim); break;
diff --git a/src/api/c/select.cpp b/src/api/c/select.cpp
index 06eef2aade..42eb91b806 100644
--- a/src/api/c/select.cpp
+++ b/src/api/c/select.cpp
@@ -63,6 +63,8 @@ af_err af_select(af_array *out, const af_array cond, const af_array a, const af_
         case u32: res = select<uint   >(cond, a, b, odims); break;
         case s64: res = select<intl   >(cond, a, b, odims); break;
         case u64: res = select<uintl  >(cond, a, b, odims); break;
+        case s16: res = select<short  >(cond, a, b, odims); break;
+        case u16: res = select<ushort >(cond, a, b, odims); break;
         case u8:  res = select<uchar  >(cond, a, b, odims); break;
         case b8:  res = select<char   >(cond, a, b, odims); break;
         default:  TYPE_ERROR(2, ainfo.getType());
@@ -106,6 +108,8 @@ af_err af_select_scalar_r(af_array *out, const af_array cond, const af_array a,
         case c64: res = select_scalar<cdouble, false>(cond, a, b, adims); break;
         case s32: res = select_scalar<int    , false>(cond, a, b, adims); break;
         case u32: res = select_scalar<uint   , false>(cond, a, b, adims); break;
+        case s16: res = select_scalar<short  , false>(cond, a, b, adims); break;
+        case u16: res = select_scalar<ushort , false>(cond, a, b, adims); break;
         case s64: res = select_scalar<intl   , false>(cond, a, b, adims); break;
         case u64: res = select_scalar<uintl  , false>(cond, a, b, adims); break;
         case u8:  res = select_scalar<uchar  , false>(cond, a, b, adims); break;
@@ -143,6 +147,8 @@ af_err af_select_scalar_l(af_array *out, const af_array cond, const double a, co
         case c64: res = select_scalar<cdouble, true >(cond, b, a, bdims); break;
         case s32: res = select_scalar<int    , true >(cond, b, a, bdims); break;
         case u32: res = select_scalar<uint   , true >(cond, b, a, bdims); break;
+        case s16: res = select_scalar<short  , true >(cond, b, a, bdims); break;
+        case u16: res = select_scalar<ushort , true >(cond, b, a, bdims); break;
         case s64: res = select_scalar<intl   , true >(cond, b, a, bdims); break;
         case u64: res = select_scalar<uintl  , true >(cond, b, a, bdims); break;
         case u8:  res = select_scalar<uchar  , true >(cond, b, a, bdims); break;
diff --git a/src/api/c/set.cpp b/src/api/c/set.cpp
index 1200eaef32..1643fad95b 100644
--- a/src/api/c/set.cpp
+++ b/src/api/c/set.cpp
@@ -36,6 +36,10 @@ af_err af_set_unique(af_array *out, const af_array in, const bool is_sorted)
         case f64: res = setUnique<double >(in, is_sorted); break;
         case s32: res = setUnique<int    >(in, is_sorted); break;
         case u32: res = setUnique<uint   >(in, is_sorted); break;
+        case s16: res = setUnique<short  >(in, is_sorted); break;
+        case u16: res = setUnique<ushort >(in, is_sorted); break;
+        case s64: res = setUnique<intl   >(in, is_sorted); break;
+        case u64: res = setUnique<uintl  >(in, is_sorted); break;
         case b8:  res = setUnique<char   >(in, is_sorted); break;
         case u8:  res = setUnique<uchar  >(in, is_sorted); break;
         default: TYPE_ERROR(1, type);
@@ -69,6 +73,10 @@ af_err af_set_union(af_array *out, const af_array first, const af_array second,
         case f64: res = setUnion<double >(first, second, is_unique); break;
         case s32: res = setUnion<int    >(first, second, is_unique); break;
         case u32: res = setUnion<uint   >(first, second, is_unique); break;
+        case s16: res = setUnion<short  >(first, second, is_unique); break;
+        case u16: res = setUnion<ushort >(first, second, is_unique); break;
+        case s64: res = setUnion<intl   >(first, second, is_unique); break;
+        case u64: res = setUnion<uintl  >(first, second, is_unique); break;
         case b8:  res = setUnion<char   >(first, second, is_unique); break;
         case u8:  res = setUnion<uchar  >(first, second, is_unique); break;
         default: TYPE_ERROR(1, first_type);
@@ -101,6 +109,10 @@ af_err af_set_intersect(af_array *out, const af_array first, const af_array seco
         case f64: res = setIntersect<double >(first, second, is_unique); break;
         case s32: res = setIntersect<int    >(first, second, is_unique); break;
         case u32: res = setIntersect<uint   >(first, second, is_unique); break;
+        case s16: res = setIntersect<short  >(first, second, is_unique); break;
+        case u16: res = setIntersect<ushort >(first, second, is_unique); break;
+        case s64: res = setIntersect<intl   >(first, second, is_unique); break;
+        case u64: res = setIntersect<uintl  >(first, second, is_unique); break;
         case b8:  res = setIntersect<char   >(first, second, is_unique); break;
         case u8:  res = setIntersect<uchar  >(first, second, is_unique); break;
         default: TYPE_ERROR(1, first_type);
diff --git a/src/api/c/shift.cpp b/src/api/c/shift.cpp
index 28e21804cf..e383915e0a 100644
--- a/src/api/c/shift.cpp
+++ b/src/api/c/shift.cpp
@@ -43,6 +43,8 @@ af_err af_shift(af_array *out, const af_array in, const int sdims[4])
             case u32: output = shift<uint   >(in, sdims);  break;
             case s64: output = shift<intl   >(in, sdims);  break;
             case u64: output = shift<uintl  >(in, sdims);  break;
+            case s16: output = shift<short  >(in, sdims);  break;
+            case u16: output = shift<ushort >(in, sdims);  break;
             case u8:  output = shift<uchar  >(in, sdims);  break;
             default:  TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/sift.cpp b/src/api/c/sift.cpp
index ef68c30556..c7a38582aa 100644
--- a/src/api/c/sift.cpp
+++ b/src/api/c/sift.cpp
@@ -23,7 +23,8 @@ using namespace detail;
 template<typename T, typename convAccT>
 static void sift(af_features& feat_, af_array& descriptors, const af_array& in, const unsigned n_layers,
                  const float contrast_thr, const float edge_thr, const float init_sigma,
-                 const bool double_input, const float img_scale, const float feature_ratio)
+                 const bool double_input, const float img_scale, const float feature_ratio,
+                 const bool compute_GLOH)
 {
     Array<float> x     = createEmptyArray<float>(dim4());
     Array<float> y     = createEmptyArray<float>(dim4());
@@ -36,7 +37,7 @@ static void sift(af_features& feat_, af_array& descriptors, const af_array& in,
 
     feat.n = sift<T, convAccT>(x, y, score, ori, size, desc, getArray<T>(in),
                                n_layers, contrast_thr, edge_thr, init_sigma,
-                               double_input, img_scale, feature_ratio);
+                               double_input, img_scale, feature_ratio, compute_GLOH);
 
     feat.x           = getHandle(x);
     feat.y           = getHandle(y);
@@ -73,15 +74,56 @@ af_err af_sift(af_features* feat, af_array* desc, const af_array in, const unsig
         switch(type) {
             case f32: sift<float , float >(*feat, tmp_desc, in, n_layers, contrast_thr,
                                            edge_thr, init_sigma, double_input,
-                                           img_scale, feature_ratio); break;
+                                           img_scale, feature_ratio, false); break;
             case f64: sift<double, double>(*feat, tmp_desc, in, n_layers, contrast_thr,
                                            edge_thr, init_sigma, double_input,
-                                           img_scale, feature_ratio); break;
+                                           img_scale, feature_ratio, false); break;
             default : TYPE_ERROR(1, type);
         }
         std::swap(*desc, tmp_desc);
 #else
-        AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+        AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AF_ERR_NONFREE);
+#endif
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+af_err af_gloh(af_features* feat, af_array* desc, const af_array in, const unsigned n_layers,
+               const float contrast_thr, const float edge_thr, const float init_sigma,
+               const bool double_input, const float img_scale, const float feature_ratio)
+{
+    try {
+#ifdef AF_BUILD_SIFT
+        ArrayInfo info = getInfo(in);
+        af::dim4 dims  = info.dims();
+
+        ARG_ASSERT(2, (dims[0] >= 15 && dims[1] >= 15 && dims[2] == 1 && dims[3] == 1));
+        ARG_ASSERT(3, n_layers > 0);
+        ARG_ASSERT(4, contrast_thr > 0.0f);
+        ARG_ASSERT(5, edge_thr >= 1.0f);
+        ARG_ASSERT(6, init_sigma > 0.5f);
+        ARG_ASSERT(8, img_scale > 0.0f);
+        ARG_ASSERT(9, feature_ratio > 0.0f);
+
+        dim_t in_ndims = dims.ndims();
+        DIM_ASSERT(1, (in_ndims <= 3 && in_ndims >= 2));
+
+        af_array tmp_desc;
+        af_dtype type  = info.getType();
+        switch(type) {
+            case f32: sift<float , float >(*feat, tmp_desc, in, n_layers, contrast_thr,
+                                           edge_thr, init_sigma, double_input,
+                                           img_scale, feature_ratio, true); break;
+            case f64: sift<double, double>(*feat, tmp_desc, in, n_layers, contrast_thr,
+                                           edge_thr, init_sigma, double_input,
+                                           img_scale, feature_ratio, true); break;
+            default : TYPE_ERROR(1, type);
+        }
+        std::swap(*desc, tmp_desc);
+#else
+        AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AF_ERR_NONFREE);
 #endif
     }
     CATCHALL;
diff --git a/src/api/c/sobel.cpp b/src/api/c/sobel.cpp
index 594bf65a14..6d28a6a95d 100644
--- a/src/api/c/sobel.cpp
+++ b/src/api/c/sobel.cpp
@@ -48,6 +48,8 @@ af_err af_sobel_operator(af_array *dx, af_array *dy, const af_array img, const u
             case f64: output = sobelDerivatives<double, double>(img, ker_size); break;
             case s32: output = sobelDerivatives<int   , int>   (img, ker_size); break;
             case u32: output = sobelDerivatives<uint  , int>   (img, ker_size); break;
+            case s16: output = sobelDerivatives<short , int>   (img, ker_size); break;
+            case u16: output = sobelDerivatives<ushort, int>   (img, ker_size); break;
             case b8 : output = sobelDerivatives<char  , int>   (img, ker_size); break;
             case u8:  output = sobelDerivatives<uchar , int>   (img, ker_size); break;
             default : TYPE_ERROR(1, type);
diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp
index 39a7f227b3..1de63c5052 100644
--- a/src/api/c/sort.cpp
+++ b/src/api/c/sort.cpp
@@ -52,6 +52,10 @@ af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool
             case f64: val = sort<double >(in, dim, isAscending);  break;
             case s32: val = sort<int    >(in, dim, isAscending);  break;
             case u32: val = sort<uint   >(in, dim, isAscending);  break;
+            case s16: val = sort<short  >(in, dim, isAscending);  break;
+            case u16: val = sort<ushort >(in, dim, isAscending);  break;
+            case s64: val = sort<intl   >(in, dim, isAscending);  break;
+            case u64: val = sort<uintl  >(in, dim, isAscending);  break;
             case u8:  val = sort<uchar  >(in, dim, isAscending);  break;
             case b8:  val = sort<char   >(in, dim, isAscending);  break;
             default:  TYPE_ERROR(1, type);
@@ -100,6 +104,10 @@ af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const
             case f64: sort_index<double >(&val, &idx, in, dim, isAscending);  break;
             case s32: sort_index<int    >(&val, &idx, in, dim, isAscending);  break;
             case u32: sort_index<uint   >(&val, &idx, in, dim, isAscending);  break;
+            case s16: sort_index<short  >(&val, &idx, in, dim, isAscending);  break;
+            case u16: sort_index<ushort >(&val, &idx, in, dim, isAscending);  break;
+            case s64: sort_index<intl   >(&val, &idx, in, dim, isAscending);  break;
+            case u64: sort_index<uintl  >(&val, &idx, in, dim, isAscending);  break;
             case u8:  sort_index<uchar  >(&val, &idx, in, dim, isAscending);  break;
             case b8:  sort_index<char   >(&val, &idx, in, dim, isAscending);  break;
             default:  TYPE_ERROR(1, type);
@@ -144,6 +152,10 @@ void sort_by_key_tmplt(af_array *okey, af_array *oval, const af_array ikey, cons
     case f64: sort_by_key<Tk, double >(okey, oval, ikey, ival, dim, isAscending);  break;
     case s32: sort_by_key<Tk, int    >(okey, oval, ikey, ival, dim, isAscending);  break;
     case u32: sort_by_key<Tk, uint   >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case s16: sort_by_key<Tk, short  >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case u16: sort_by_key<Tk, ushort >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case s64: sort_by_key<Tk, intl   >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case u64: sort_by_key<Tk, uintl  >(okey, oval, ikey, ival, dim, isAscending);  break;
     case u8:  sort_by_key<Tk, uchar  >(okey, oval, ikey, ival, dim, isAscending);  break;
     case b8:  sort_by_key<Tk, char   >(okey, oval, ikey, ival, dim, isAscending);  break;
     default:  TYPE_ERROR(1, vtype);
@@ -175,6 +187,10 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
             case f64: sort_by_key_tmplt<double >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case s32: sort_by_key_tmplt<int    >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case u32: sort_by_key_tmplt<uint   >(&oKey, &oVal, keys, values, dim, isAscending);  break;
+            case s16: sort_by_key_tmplt<short  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
+            case u16: sort_by_key_tmplt<ushort >(&oKey, &oVal, keys, values, dim, isAscending);  break;
+            case s64: sort_by_key_tmplt<intl   >(&oKey, &oVal, keys, values, dim, isAscending);  break;
+            case u64: sort_by_key_tmplt<uintl  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case u8:  sort_by_key_tmplt<uchar  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case b8:  sort_by_key_tmplt<char   >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             default:  TYPE_ERROR(1, type);
diff --git a/src/api/c/stats.h b/src/api/c/stats.h
index 0e74942880..56439d507a 100644
--- a/src/api/c/stats.h
+++ b/src/api/c/stats.h
@@ -40,39 +40,52 @@ struct baseOutType {
                                 float>::type type;
 };
 
-template<typename T>
-inline T mean(const Array<T>& in)
+template<typename Ti, typename To>
+inline To mean(const Array<Ti>& in)
 {
-    T out = reduce_all<af_add_t, T, T>(in);
-    T result = division(out, in.elements());
+    To out    = reduce_all<af_add_t, Ti, To>(in);
+    To result = division(out, in.elements());
     return result;
 }
 
-template<typename T, typename wType>
-inline T mean(const Array<T>& in, const Array<wType>& weights)
+template<typename T, typename Tw>
+static T mean(const Array<T>& input, const Array<Tw>& weights)
 {
-    Array<T> wts   = cast<T>(weights);
-
-    dim4 iDims = in.dims();
+    dim4 iDims = input.dims();
 
-    Array<T> wtdInput = arithOp<T, af_mul_t>(in, wts, iDims);
+    Array<T> wtdInput = arithOp<T, af_mul_t>(input, weights, iDims);
 
     T wtdSum = reduce_all<af_add_t, T, T>(wtdInput);
-    wType wtsSum = reduce_all<af_add_t, wType, wType>(weights);
+    T wtsSum = reduce_all<af_add_t, T, T>(weights);
 
     return division(wtdSum, wtsSum);
 }
 
-template<typename T>
-inline Array<T> mean(const Array<T>& in, dim_t dim)
+#define COMPLEX_TYPE_SPECILIZATION(T, Tw) \
+template<>\
+STATIC_ T mean<T, Tw>(const Array<T>& input, const Array<Tw>& weights)\
+{\
+    Array<T> wts = cast<T, Tw>(weights);\
+    dim4 iDims   = input.dims();\
+    Array<T> wtdInput = arithOp<T, af_mul_t>(input, wts, iDims);\
+    T wtdSum  = reduce_all<af_add_t, T, T>(wtdInput);\
+    Tw wtsSum = reduce_all<af_add_t, Tw, Tw>(weights);\
+    return division(wtdSum, wtsSum);\
+}
+
+COMPLEX_TYPE_SPECILIZATION(cfloat, float)
+COMPLEX_TYPE_SPECILIZATION(cdouble, double)
+
+template<typename Ti, typename To>
+inline Array<To> mean(const Array<Ti>& in, dim_t dim)
 {
-    Array<T> redArr = reduce<af_add_t, T, T>(in, dim);
+    Array<To> redArr = reduce<af_add_t, Ti, To>(in, dim);
 
     dim4 iDims = in.dims();
     dim4 oDims = redArr.dims();
 
-    Array<T> cnstArr = createValueArray<T>(oDims, scalar<T>(iDims[dim]));
-    Array<T> result  = arithOp<T, af_div_t>(redArr, cnstArr, oDims);
+    Array<To> cnstArr = createValueArray<To>(oDims, scalar<To>(iDims[dim]));
+    Array<To> result  = arithOp<To, af_div_t>(redArr, cnstArr, oDims);
 
     return result;
 }
diff --git a/src/api/c/stdev.cpp b/src/api/c/stdev.cpp
index b2f307b628..59c9653bdf 100644
--- a/src/api/c/stdev.cpp
+++ b/src/api/c/stdev.cpp
@@ -28,15 +28,12 @@ using namespace detail;
 template<typename inType, typename outType>
 static outType stdev(const af_array& in)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
-
-    Array<outType> meanCnst= createValueArray<outType>(input.dims(), mean<outType>(input));
-
-    Array<outType> diff    = detail::arithOp<outType, af_sub_t>(input, meanCnst, input.dims());
-
-    Array<outType> diffSq  = detail::arithOp<outType, af_mul_t>(diff, diff, diff.dims());
-
-    outType result = division(reduce_all<af_add_t, outType, outType>(diffSq), input.elements());
+    Array<inType> _in       = getArray<inType>(in);
+    Array<outType> input    = cast<outType>(_in);
+    Array<outType> meanCnst = createValueArray<outType>(input.dims(), mean<inType, outType>(_in));
+    Array<outType> diff     = detail::arithOp<outType, af_sub_t>(input, meanCnst, input.dims());
+    Array<outType> diffSq   = detail::arithOp<outType, af_mul_t>(diff, diff, diff.dims());
+    outType result          = division(reduce_all<af_add_t, outType, outType>(diffSq), input.elements());
 
     return sqrt(result);
 }
@@ -44,10 +41,11 @@ static outType stdev(const af_array& in)
 template<typename inType, typename outType>
 static af_array stdev(const af_array& in, int dim)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
+    Array<inType> _in    = getArray<inType>(in);
+    Array<outType> input = cast<outType>(_in);
     dim4 iDims = input.dims();
 
-    Array<outType> meanArr = mean<outType>(input, dim);
+    Array<outType> meanArr = mean<inType, outType>(_in, dim);
 
     /* now tile meanArr along dim and use it for variance computation */
     dim4 tileDims(1);
@@ -77,6 +75,8 @@ af_err af_stdev_all(double *realVal, double *imagVal, const af_array in)
             case f32: *realVal = stdev<float , float >(in); break;
             case s32: *realVal = stdev<int   , float >(in); break;
             case u32: *realVal = stdev<uint  , float >(in); break;
+            case s16: *realVal = stdev<short , float >(in); break;
+            case u16: *realVal = stdev<ushort, float >(in); break;
             case s64: *realVal = stdev<intl  , double>(in); break;
             case u64: *realVal = stdev<uintl , double>(in); break;
             case  u8: *realVal = stdev<uchar , float >(in); break;
@@ -112,6 +112,8 @@ af_err af_stdev(af_array *out, const af_array in, const dim_t dim)
             case f32: output = stdev<float ,  float >(in, dim); break;
             case s32: output = stdev<int   ,  float >(in, dim); break;
             case u32: output = stdev<uint  ,  float >(in, dim); break;
+            case s16: output = stdev<short ,  float >(in, dim); break;
+            case u16: output = stdev<ushort,  float >(in, dim); break;
             case s64: output = stdev<intl  ,  double>(in, dim); break;
             case u64: output = stdev<uintl ,  double>(in, dim); break;
             case  u8: output = stdev<uchar ,  float >(in, dim); break;
diff --git a/src/api/c/stream.cpp b/src/api/c/stream.cpp
index 1161703d0a..a7b5771ee0 100644
--- a/src/api/c/stream.cpp
+++ b/src/api/c/stream.cpp
@@ -133,6 +133,8 @@ af_err af_save_array(int *index, const char *key, const af_array arr, const char
             case u8:    id = save<uchar>   (key, arr, filename, append);   break;
             case s64:   id = save<intl>    (key, arr, filename, append);   break;
             case u64:   id = save<uintl>   (key, arr, filename, append);   break;
+            case s16:   id = save<short>   (key, arr, filename, append);   break;
+            case u16:   id = save<ushort>  (key, arr, filename, append);   break;
             default:    TYPE_ERROR(1, type);
         }
         std::swap(*index, id);
@@ -234,6 +236,8 @@ static af_array readArrayV1(const char *filename, const unsigned index)
         case u8  : out = readDataToArray<uchar>  (fs);  break;
         case s64 : out = readDataToArray<intl>   (fs);  break;
         case u64 : out = readDataToArray<uintl>  (fs);  break;
+        case s16 : out = readDataToArray<short>  (fs);  break;
+        case u16 : out = readDataToArray<ushort> (fs);  break;
         default:    TYPE_ERROR(1, type);
     }
     fs.close();
diff --git a/src/api/c/surface.cpp b/src/api/c/surface.cpp
new file mode 100644
index 0000000000..835849d15a
--- /dev/null
+++ b/src/api/c/surface.cpp
@@ -0,0 +1,135 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/graphics.h>
+#include <af/image.h>
+
+#include <ArrayInfo.hpp>
+#include <graphics_common.hpp>
+#include <err_common.hpp>
+#include <backend.hpp>
+#include <surface.hpp>
+#include <reduce.hpp>
+#include <join.hpp>
+#include <tile.hpp>
+#include <reorder.hpp>
+#include <handle.hpp>
+
+using af::dim4;
+using namespace detail;
+
+#if defined(WITH_GRAPHICS)
+using namespace graphics;
+
+template<typename T>
+fg::Surface* setup_surface(const af_array xVals, const af_array yVals, const af_array zVals)
+{
+    Array<T> xIn = getArray<T>(xVals);
+    Array<T> yIn = getArray<T>(yVals);
+    Array<T> zIn = getArray<T>(zVals);
+
+    T xmax = reduce_all<af_max_t, T, T>(xIn);
+    T xmin = reduce_all<af_min_t, T, T>(xIn);
+    T ymax = reduce_all<af_max_t, T, T>(yIn);
+    T ymin = reduce_all<af_min_t, T, T>(yIn);
+    T zmax = reduce_all<af_max_t, T, T>(zIn);
+    T zmin = reduce_all<af_min_t, T, T>(zIn);
+
+    ArrayInfo Xinfo = getInfo(xVals);
+    ArrayInfo Yinfo = getInfo(yVals);
+    ArrayInfo Zinfo = getInfo(zVals);
+
+    af::dim4 X_dims = Xinfo.dims();
+    af::dim4 Y_dims = Yinfo.dims();
+    af::dim4 Z_dims = Zinfo.dims();
+
+    dim4   rdims(1, 0, 2, 3);
+    dim4 x_tdims(1, Y_dims[0], 1, 1);
+    dim4 y_tdims(1, X_dims[0], 1, 1);
+    if(Xinfo.isVector()){
+        xIn = tile(xIn, x_tdims);
+        yIn = tile(yIn, y_tdims);
+        yIn = reorder(yIn, rdims);
+    }
+
+    xIn.modDims(xIn.elements());
+    yIn.modDims(yIn.elements());
+    zIn.modDims(zIn.elements());
+    Array<T> Z = join(1, join(1, xIn, yIn), zIn);
+    Z = reorder(Z, rdims);
+    Z.modDims(Z.elements());
+
+    ForgeManager& fgMngr = ForgeManager::getInstance();
+    fg::Surface* surface = fgMngr.getSurface(Z_dims[0], Z_dims[1], getGLType<T>());
+    surface->setColor(1.0, 0.0, 0.0);
+    surface->setAxesLimits(xmax, xmin, ymax, ymin, zmax, zmin);
+    surface->setAxesTitles("X Axis", "Y Axis", "Z Axis");
+
+    copy_surface<T>(Z, surface);
+
+    return surface;
+}
+#endif
+
+af_err af_draw_surface(const af_window wind, const af_array xVals, const af_array yVals, const af_array S, const af_cell* const props)
+{
+#if defined(WITH_GRAPHICS)
+    if(wind==0) {
+        std::cerr<<"Not a valid window"<<std::endl;
+        return AF_SUCCESS;
+    }
+
+    try {
+        ArrayInfo Xinfo = getInfo(xVals);
+        af::dim4 X_dims = Xinfo.dims();
+        af_dtype Xtype  = Xinfo.getType();
+
+        ArrayInfo Yinfo = getInfo(yVals);
+        af::dim4 Y_dims = Yinfo.dims();
+        af_dtype Ytype  = Yinfo.getType();
+
+        ArrayInfo Sinfo = getInfo(S);
+        af::dim4 S_dims = Sinfo.dims();
+        af_dtype Stype  = Sinfo.getType();
+
+        TYPE_ASSERT(Xtype == Ytype);
+        TYPE_ASSERT(Ytype == Stype);
+
+        if(!Yinfo.isVector()){
+            DIM_ASSERT(1, X_dims == Y_dims);
+            DIM_ASSERT(3, Y_dims == S_dims);
+        }else{
+            DIM_ASSERT(3, ( X_dims[0] * Y_dims[0] == (dim_t)Sinfo.elements()));
+        }
+
+        fg::Window* window = reinterpret_cast<fg::Window*>(wind);
+        window->makeCurrent();
+        fg::Surface* surface = NULL;
+
+        switch(Xtype) {
+            case f32: surface = setup_surface<float  >(xVals, yVals , S); break;
+            case s32: surface = setup_surface<int    >(xVals, yVals , S); break;
+            case u32: surface = setup_surface<uint   >(xVals, yVals , S); break;
+            case s16: surface = setup_surface<short  >(xVals, yVals , S); break;
+            case u16: surface = setup_surface<ushort >(xVals, yVals , S); break;
+            case u8 : surface = setup_surface<uchar  >(xVals, yVals , S); break;
+            default:  TYPE_ERROR(1, Xtype);
+        }
+
+        if (props->col>-1 && props->row>-1)
+            window->draw(props->col, props->row, *surface, props->title);
+        else
+            window->draw(*surface);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+#else
+    return AF_ERR_NO_GFX;
+#endif
+}
diff --git a/src/api/c/susan.cpp b/src/api/c/susan.cpp
index e070df870b..75c295388e 100644
--- a/src/api/c/susan.cpp
+++ b/src/api/c/susan.cpp
@@ -34,14 +34,11 @@ static af_features susan(af_array const &in,
                       getArray<T>(in), radius, diff_thr, geom_thr,
                       feature_ratio, edge);
 
-    Array<float> orientation = createValueArray<float>(feat.n, 0.0);
-    Array<float> size = createValueArray<float>(feat.n, 1.0);
-
     feat.x           = getHandle(x);
     feat.y           = getHandle(y);
     feat.score       = getHandle(score);
-    feat.orientation = getHandle(orientation);
-    feat.size        = getHandle(size);
+    feat.orientation = getHandle(feat.n > 0 ? createValueArray<float>(feat.n, 0.0) : createEmptyArray<float>(dim4()));
+    feat.size        = getHandle(feat.n > 0 ? createValueArray<float>(feat.n, 1.0) : createEmptyArray<float>(dim4()));
 
     return getFeaturesHandle(feat);
 }
@@ -69,6 +66,8 @@ af_err af_susan(af_features* out, const af_array in,
             case b8 : *out = susan<char  >(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
             case s32: *out = susan<int   >(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
             case u32: *out = susan<uint  >(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
+            case s16: *out = susan<short >(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
+            case u16: *out = susan<ushort>(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
             case u8 : *out = susan<uchar >(in, radius, diff_thr, geom_thr, feature_ratio, edge); break;
             default : TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/tile.cpp b/src/api/c/tile.cpp
index 7d546c2ead..f722f89892 100644
--- a/src/api/c/tile.cpp
+++ b/src/api/c/tile.cpp
@@ -70,6 +70,8 @@ af_err af_tile(af_array *out, const af_array in, const af::dim4 &tileDims)
             case u32: output = tile<uint   >(in, tileDims);  break;
             case s64: output = tile<intl   >(in, tileDims);  break;
             case u64: output = tile<uintl  >(in, tileDims);  break;
+            case s16: output = tile<short  >(in, tileDims);  break;
+            case u16: output = tile<ushort >(in, tileDims);  break;
             case u8:  output = tile<uchar  >(in, tileDims);  break;
             default:  TYPE_ERROR(1, type);
         }
diff --git a/src/api/c/transform.cpp b/src/api/c/transform.cpp
index c24c9f7793..bacb008c78 100644
--- a/src/api/c/transform.cpp
+++ b/src/api/c/transform.cpp
@@ -63,6 +63,8 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf,
             case u32: output = transform<uint   >(in, tf, odims, method, inverse);  break;
             case s64: output = transform<intl   >(in, tf, odims, method, inverse);  break;
             case u64: output = transform<uintl  >(in, tf, odims, method, inverse);  break;
+            case s16: output = transform<short  >(in, tf, odims, method, inverse);  break;
+            case u16: output = transform<ushort >(in, tf, odims, method, inverse);  break;
             case u8:  output = transform<uchar  >(in, tf, odims, method, inverse);  break;
             case b8:  output = transform<char   >(in, tf, odims, method, inverse);  break;
             default:  TYPE_ERROR(1, itype);
diff --git a/src/api/c/transpose.cpp b/src/api/c/transpose.cpp
index eb89695926..1418c290c4 100644
--- a/src/api/c/transpose.cpp
+++ b/src/api/c/transpose.cpp
@@ -61,6 +61,8 @@ af_err af_transpose(af_array *out, af_array in, const bool conjugate)
             case u8 : output = trs<uchar>  (in, conjugate);    break;
             case s64: output = trs<intl>   (in, conjugate);    break;
             case u64: output = trs<uintl>  (in, conjugate);    break;
+            case s16: output = trs<short>  (in, conjugate);    break;
+            case u16: output = trs<ushort> (in, conjugate);    break;
             default : TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
@@ -101,6 +103,8 @@ af_err af_transpose_inplace(af_array in, const bool conjugate)
             case u8 : transpose_inplace<uchar>  (in, conjugate);    break;
             case s64: transpose_inplace<intl>   (in, conjugate);    break;
             case u64: transpose_inplace<uintl>  (in, conjugate);    break;
+            case s16: transpose_inplace<short>  (in, conjugate);    break;
+            case u16: transpose_inplace<ushort> (in, conjugate);    break;
             default : TYPE_ERROR(1, type);
         }
     }
diff --git a/src/api/c/type_util.cpp b/src/api/c/type_util.cpp
index 750932c9cd..39a9af60d7 100644
--- a/src/api/c/type_util.cpp
+++ b/src/api/c/type_util.cpp
@@ -18,8 +18,12 @@ const char *getName(af_dtype type)
     case c64: return "complex double";
     case u32: return "unsigned int";
     case s32: return "int";
-    case u8: return "unsigned char";
-    case b8: return "bool";
-    default: return "unknown type";
+    case u16: return "unsigned short";
+    case s16: return "short";
+    case u64: return "unsigned long long";
+    case s64: return "long long";
+    case u8 : return "unsigned char";
+    case b8 : return "bool";
+    default : return "unknown type";
     }
 }
diff --git a/src/api/c/unary.cpp b/src/api/c/unary.cpp
index 3970128305..a92df7b06d 100644
--- a/src/api/c/unary.cpp
+++ b/src/api/c/unary.cpp
@@ -20,6 +20,7 @@
 #include <unary.hpp>
 #include <implicit.hpp>
 #include <complex.hpp>
+#include <logic.hpp>
 #include <cast.hpp>
 #include <arith.hpp>
 
@@ -244,23 +245,59 @@ static inline af_array checkOp(const af_array in)
     return res;
 }
 
+template<af_op_t op>
+struct cplxLogicOp
+{
+    af_array operator()(Array<char> resR, Array<char> resI, dim4 dims)
+    {
+        return getHandle(logicOp<char, af_or_t>(resR, resI, dims));
+    }
+};
+
+template <>
+struct cplxLogicOp<af_iszero_t>
+{
+    af_array operator()(Array<char> resR, Array<char> resI, dim4 dims)
+    {
+        return getHandle(logicOp<char, af_and_t>(resR, resI, dims));
+    }
+};
+
+template<typename T, typename BT, af_op_t op>
+static inline af_array checkOpCplx(const af_array in)
+{
+    Array<BT> R = real<BT, T>(getArray<T>(in));
+    Array<BT> I = imag<BT, T>(getArray<T>(in));
+
+    Array<char> resR = checkOp<BT, op>(R);
+    Array<char> resI = checkOp<BT, op>(I);
+
+    ArrayInfo in_info = getInfo(in);
+    dim4 dims = in_info.dims();
+    cplxLogicOp<op> cplxLogic;
+    af_array res = cplxLogic(resR, resI, dims);
+
+    return res;
+}
+
 template<af_op_t op>
 static af_err af_check(af_array *out, const af_array in)
 {
     try {
 
         ArrayInfo in_info = getInfo(in);
-        ARG_ASSERT(1, in_info.isReal());
 
         af_dtype in_type = in_info.getType();
         af_array res;
 
-        // Convert all inputs to floats / doubles
+        // Convert all inputs to floats / doubles / complex
         af_dtype type = implicit(in_type, f32);
 
         switch (type) {
         case f32 : res = checkOp<float  , op>(in); break;
         case f64 : res = checkOp<double , op>(in); break;
+        case c32 : res = checkOpCplx<cfloat , float , op>(in); break;
+        case c64 : res = checkOpCplx<cdouble, double, op>(in); break;
         default:
             TYPE_ERROR(1, in_type); break;
         }
diff --git a/src/api/c/unwrap.cpp b/src/api/c/unwrap.cpp
index 2e80d94595..25b4a67bed 100644
--- a/src/api/c/unwrap.cpp
+++ b/src/api/c/unwrap.cpp
@@ -52,6 +52,8 @@ af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t w
             case u32: output = unwrap<uint   >(in, wx, wy, sx, sy, px, py, is_column);  break;
             case s64: output = unwrap<intl   >(in, wx, wy, sx, sy, px, py, is_column);  break;
             case u64: output = unwrap<uintl  >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case s16: output = unwrap<short  >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case u16: output = unwrap<ushort >(in, wx, wy, sx, sy, px, py, is_column);  break;
             case u8:  output = unwrap<uchar  >(in, wx, wy, sx, sy, px, py, is_column);  break;
             case b8:  output = unwrap<char   >(in, wx, wy, sx, sy, px, py, is_column);  break;
             default:  TYPE_ERROR(1, type);
diff --git a/src/api/c/util.cpp b/src/api/c/util.cpp
new file mode 100644
index 0000000000..cc9a07ac4f
--- /dev/null
+++ b/src/api/c/util.cpp
@@ -0,0 +1,81 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/index.h>
+// The following should be included using double quotes
+// to enable it's use in unified wrapper
+#include "err_common.hpp"
+
+af_seq af_make_seq(double begin, double end, double step)
+{
+    af_seq seq = {begin, end, step};
+    return seq;
+}
+
+af_err af_create_indexers(af_index_t** indexers)
+{
+    try {
+        af_index_t* out = new af_index_t[4];
+        std::swap(*indexers, out);
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
+
+af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim)
+{
+    ARG_ASSERT(0, (indexer!=NULL));
+    ARG_ASSERT(1, (idx!=NULL));
+    ARG_ASSERT(2, (dim>=0 && dim<=3));
+    try {
+        indexer[dim].idx.arr = idx;
+        indexer[dim].isBatch = false;
+        indexer[dim].isSeq   = false;
+    }
+    CATCHALL
+        return AF_SUCCESS;
+}
+
+af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch)
+{
+    ARG_ASSERT(0, (indexer!=NULL));
+    ARG_ASSERT(1, (idx!=NULL));
+    ARG_ASSERT(2, (dim>=0 && dim<=3));
+    try {
+        indexer[dim].idx.seq = *idx;
+        indexer[dim].isBatch = is_batch;
+        indexer[dim].isSeq   = true;
+    }
+    CATCHALL
+        return AF_SUCCESS;
+}
+
+af_err af_set_seq_param_indexer(af_index_t* indexer,
+                              const double begin, const double end, const double step,
+                              const dim_t dim, const bool is_batch)
+{
+    ARG_ASSERT(0, (indexer!=NULL));
+    ARG_ASSERT(4, (dim>=0 && dim<=3));
+    try {
+        indexer[dim].idx.seq = af_make_seq(begin, end, step);
+        indexer[dim].isBatch = is_batch;
+        indexer[dim].isSeq   = true;
+    }
+    CATCHALL
+        return AF_SUCCESS;
+}
+
+af_err af_release_indexers(af_index_t* indexers)
+{
+    try {
+        delete[] indexers;
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
diff --git a/src/api/c/var.cpp b/src/api/c/var.cpp
index 7feb1c4692..59a651b4af 100644
--- a/src/api/c/var.cpp
+++ b/src/api/c/var.cpp
@@ -26,9 +26,10 @@ using namespace detail;
 template<typename inType, typename outType>
 static outType varAll(const af_array& in, const bool isbiased)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
+    Array<inType> inArr = getArray<inType>(in);
+    Array<outType> input = cast<outType>(inArr);
 
-    Array<outType> meanCnst= createValueArray<outType>(input.dims(), mean<outType>(input));
+    Array<outType> meanCnst= createValueArray<outType>(input.dims(), mean<inType, outType>(inArr));
 
     Array<outType> diff    = arithOp<outType, af_sub_t>(input, meanCnst, input.dims());
 
@@ -65,10 +66,11 @@ static outType varAll(const af_array& in, const af_array weights)
 template<typename inType, typename outType>
 static af_array var(const af_array& in, const bool isbiased, int dim)
 {
-    Array<outType> input = cast<outType>(getArray<inType>(in));
+    Array<inType> _in    = getArray<inType>(in);
+    Array<outType> input = cast<outType>(_in);
     dim4 iDims = input.dims();
 
-    Array<outType> meanArr = mean<outType>(input, dim);
+    Array<outType> meanArr = mean<inType, outType>(_in, dim);
 
     /* now tile meanArr along dim and use it for variance computation */
     dim4 tileDims(1);
@@ -127,6 +129,8 @@ af_err af_var(af_array *out, const af_array in, const bool isbiased, const dim_t
             case f32: output = var<float ,  float >(in, isbiased, dim); break;
             case s32: output = var<int   ,  float >(in, isbiased, dim); break;
             case u32: output = var<uint  ,  float >(in, isbiased, dim); break;
+            case s16: output = var<short ,  float >(in, isbiased, dim); break;
+            case u16: output = var<ushort,  float >(in, isbiased, dim); break;
             case s64: output = var<intl  ,  double>(in, isbiased, dim); break;
             case u64: output = var<uintl ,  double>(in, isbiased, dim); break;
             case  u8: output = var<uchar ,  float >(in, isbiased, dim); break;
@@ -159,6 +163,8 @@ af_err af_var_weighted(af_array *out, const af_array in, const af_array weights,
             case f32: output = var<float ,  float >(in, weights, dim); break;
             case s32: output = var<int   ,  float >(in, weights, dim); break;
             case u32: output = var<uint  ,  float >(in, weights, dim); break;
+            case s16: output = var<short ,  float >(in, weights, dim); break;
+            case u16: output = var<ushort,  float >(in, weights, dim); break;
             case s64: output = var<intl  ,  double>(in, weights, dim); break;
             case u64: output = var<uintl ,  double>(in, weights, dim); break;
             case  u8: output = var<uchar ,  float >(in, weights, dim); break;
@@ -183,6 +189,8 @@ af_err af_var_all(double *realVal, double *imagVal, const af_array in, const boo
             case f32: *realVal = varAll<float , float >(in, isbiased); break;
             case s32: *realVal = varAll<int   , float >(in, isbiased); break;
             case u32: *realVal = varAll<uint  , float >(in, isbiased); break;
+            case s16: *realVal = varAll<short , float >(in, isbiased); break;
+            case u16: *realVal = varAll<ushort, float >(in, isbiased); break;
             case s64: *realVal = varAll<intl  , double>(in, isbiased); break;
             case u64: *realVal = varAll<uintl , double>(in, isbiased); break;
             case  u8: *realVal = varAll<uchar , float >(in, isbiased); break;
@@ -219,6 +227,8 @@ af_err af_var_all_weighted(double *realVal, double *imagVal, const af_array in,
             case f32: *realVal = varAll<float , float >(in, weights); break;
             case s32: *realVal = varAll<int   , float >(in, weights); break;
             case u32: *realVal = varAll<uint  , float >(in, weights); break;
+            case s16: *realVal = varAll<short , float >(in, weights); break;
+            case u16: *realVal = varAll<ushort, float >(in, weights); break;
             case s64: *realVal = varAll<intl  , double >(in, weights); break;
             case u64: *realVal = varAll<uintl , double >(in, weights); break;
             case  u8: *realVal = varAll<uchar , float >(in, weights); break;
diff --git a/src/api/c/where.cpp b/src/api/c/where.cpp
index 0853e6df46..4aad8c4a75 100644
--- a/src/api/c/where.cpp
+++ b/src/api/c/where.cpp
@@ -40,6 +40,8 @@ af_err af_where(af_array *idx, const af_array in)
         case u32: res = where<uint   >(in); break;
         case s64: res = where<intl   >(in); break;
         case u64: res = where<uintl  >(in); break;
+        case s16: res = where<short  >(in); break;
+        case u16: res = where<ushort >(in); break;
         case u8 : res = where<uchar  >(in); break;
         case b8 : res = where<char   >(in); break;
         default:
diff --git a/src/api/c/wrap.cpp b/src/api/c/wrap.cpp
index dc2b54b680..85386b2a6b 100644
--- a/src/api/c/wrap.cpp
+++ b/src/api/c/wrap.cpp
@@ -66,6 +66,8 @@ af_err af_wrap(af_array *out, const af_array in,
             case u32: output = wrap<uint   >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
             case s64: output = wrap<intl   >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
             case u64: output = wrap<uintl  >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
+            case s16: output = wrap<short  >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
+            case u16: output = wrap<ushort >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
             case u8:  output = wrap<uchar  >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
             case b8:  output = wrap<char   >(in, ox, oy, wx, wy, sx, sy, px, py, is_column);  break;
             default:  TYPE_ERROR(1, type);
diff --git a/src/api/cpp/array.cpp b/src/api/cpp/array.cpp
index 3280457e19..208f60ed68 100644
--- a/src/api/cpp/array.cpp
+++ b/src/api/cpp/array.cpp
@@ -84,6 +84,8 @@ namespace af
         case b8 : return sizeof(unsigned char);
         case c32: return sizeof(float) * 2;
         case c64: return sizeof(double) * 2;
+        case s16: return sizeof(short);
+        case u16: return sizeof(unsigned short);
         default: return sizeof(float);
         }
     }
@@ -219,6 +221,8 @@ namespace af
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(unsigned short)
 
 #undef INSTANTIATE
 
@@ -669,15 +673,18 @@ namespace af
     ASSIGN_TYPE(char               , OP)        \
     ASSIGN_TYPE(unsigned char      , OP)        \
     ASSIGN_TYPE(bool               , OP)        \
+    ASSIGN_TYPE(short              , OP)        \
+    ASSIGN_TYPE(unsigned short     , OP)        \
 
     ASSIGN_OP(= , =)
     ASSIGN_OP(+=, +)
     ASSIGN_OP(-=, -)
     ASSIGN_OP(*=, *)
     ASSIGN_OP(/=, /)
-#undef ASSIGN_TYPE
 #undef ASSIGN_OP
 
+#undef ASSIGN_TYPE
+
 #define SELF_OP(OP, op1)                                                          \
     array::array_proxy& array::array_proxy::operator OP(const array_proxy &other) \
     {                                                                             \
@@ -815,6 +822,8 @@ namespace af
     ASSIGN_TYPE(char               , OP)                            \
     ASSIGN_TYPE(unsigned char      , OP)                            \
     ASSIGN_TYPE(bool               , OP)                            \
+    ASSIGN_TYPE(short              , OP)                            \
+    ASSIGN_TYPE(unsigned short     , OP)                            \
 
     ASSIGN_OP(+=, af_add)
     ASSIGN_OP(-=, af_sub)
@@ -822,6 +831,7 @@ namespace af
     ASSIGN_OP(/=, af_div)
 
 #undef ASSIGN_OP
+
 #undef ASSIGN_TYPE
 
 #define ASSIGN_TYPE(TY, OP)                                     \
@@ -847,10 +857,13 @@ namespace af
     ASSIGN_TYPE(char               , OP)        \
     ASSIGN_TYPE(unsigned char      , OP)        \
     ASSIGN_TYPE(bool               , OP)        \
+    ASSIGN_TYPE(short              , OP)        \
+    ASSIGN_TYPE(unsigned short     , OP)        \
 
     ASSIGN_OP(= )
 
 #undef ASSIGN_OP
+
 #undef ASSIGN_TYPE
 
 af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
@@ -917,6 +930,8 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
     BINARY_TYPE(char               , OP, func, b8)              \
     BINARY_TYPE(unsigned char      , OP, func, u8)              \
     BINARY_TYPE(bool               , OP, func, b8)              \
+    BINARY_TYPE(short              , OP, func, s16)             \
+    BINARY_TYPE(unsigned short     , OP, func, u16)             \
 
     BINARY_OP(+, af_add)
     BINARY_OP(-, af_sub)
@@ -937,9 +952,10 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
     BINARY_OP(<<, af_bitshiftl)
     BINARY_OP(>>, af_bitshiftr)
 
-#undef BINARY_TYPE
 #undef BINARY_OP
 
+#undef BINARY_TYPE
+
     array array::operator-() const
     {
         af_array lhs = this->get();
@@ -1013,6 +1029,8 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(unsigned short)
 
 #undef INSTANTIATE
 
@@ -1041,6 +1059,8 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type)
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(unsigned short)
 
 #undef INSTANTIATE
 #undef TEMPLATE_MEM_FUNC
diff --git a/src/api/cpp/corrcoef.cpp b/src/api/cpp/corrcoef.cpp
index 3b8f5cfcdb..ed78a684e3 100644
--- a/src/api/cpp/corrcoef.cpp
+++ b/src/api/cpp/corrcoef.cpp
@@ -28,6 +28,10 @@ INSTANTIATE_CORRCOEF(int);
 INSTANTIATE_CORRCOEF(unsigned int);
 INSTANTIATE_CORRCOEF(char);
 INSTANTIATE_CORRCOEF(unsigned char);
+INSTANTIATE_CORRCOEF(intl);
+INSTANTIATE_CORRCOEF(uintl);
+INSTANTIATE_CORRCOEF(short);
+INSTANTIATE_CORRCOEF(unsigned short);
 
 #undef INSTANTIATE_CORRCOEF
 
diff --git a/src/api/cpp/data.cpp b/src/api/cpp/data.cpp
index 196fbf812b..3b7854a20b 100644
--- a/src/api/cpp/data.cpp
+++ b/src/api/cpp/data.cpp
@@ -117,6 +117,8 @@ namespace af
     CONSTANT(long long);
     CONSTANT(unsigned long long);
     CONSTANT(bool);
+    CONSTANT(short);
+    CONSTANT(unsigned short);
 
 #undef CONSTANT
 
diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp
index 0a39ed2bae..bec0a60d59 100644
--- a/src/api/cpp/device.cpp
+++ b/src/api/cpp/device.cpp
@@ -7,13 +7,41 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#include <af/array.h>
 #include <af/device.h>
 #include <af/compatible.h>
 #include <af/traits.hpp>
+#include <af/backend.h>
 #include "error.hpp"
 
 namespace af
 {
+    void setBackend(const Backend bknd)
+    {
+        AF_THROW(af_set_backend(bknd));
+    }
+
+    unsigned getBackendCount()
+    {
+        unsigned temp = 1;
+        AF_THROW(af_get_backend_count(&temp));
+        return temp;
+    }
+
+    int getAvailableBackends()
+    {
+        int result = 0;
+        AF_THROW(af_get_available_backends(&result));
+        return result;
+    }
+
+    af::Backend getBackendId(const array &in)
+    {
+        af::Backend result = (af::Backend)0;
+        AF_THROW(af_get_backend_id(&result, in.get()));
+        return result;
+    }
+
     void info()
     {
         AF_THROW(af_info());
@@ -78,6 +106,8 @@ namespace af
         case b8 : return sizeof(unsigned char);
         case c32: return sizeof(float) * 2;
         case c64: return sizeof(double) * 2;
+        case s16: return sizeof(short);
+        case u16: return sizeof(unsigned short);
         default: return sizeof(float);
         }
     }
@@ -154,5 +184,7 @@ namespace af
     INSTANTIATE(unsigned)
     INSTANTIATE(unsigned char)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(unsigned short)
 
 }
diff --git a/src/api/cpp/features.cpp b/src/api/cpp/features.cpp
index be9e160028..9cf23699b0 100644
--- a/src/api/cpp/features.cpp
+++ b/src/api/cpp/features.cpp
@@ -9,7 +9,6 @@
 
 #include <af/features.h>
 #include <af/array.h>
-#include <handle.hpp>
 #include "error.hpp"
 
 namespace af
diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp
index 1272d2f67c..b7480195dc 100644
--- a/src/api/cpp/graphics.cpp
+++ b/src/api/cpp/graphics.cpp
@@ -79,12 +79,33 @@ void Window::plot(const array& X, const array& Y, const char* const title)
     AF_THROW(af_draw_plot(get(), X.get(), Y.get(), &temp));
 }
 
+void Window::plot3(const array& P, const char* const title)
+{
+    af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
+    P.eval();
+    AF_THROW(af_draw_plot3(get(), P.get(), &temp));
+}
+
 void Window::hist(const array& X, const double minval, const double maxval, const char* const title)
 {
     af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
     AF_THROW(af_draw_hist(get(), X.get(), minval, maxval, &temp));
 }
 
+void Window::surface(const array& S, const char* const title){
+    //TODO: fix offset on forge?
+    af::array xVals = seq(0, S.dims(0)-1);
+    af::array yVals = seq(0, S.dims(1)-1);
+    af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
+    AF_THROW(af_draw_surface(get(), xVals.get(), yVals.get(), S.get(), &temp));
+}
+
+void Window::surface(const array& xVals, const array& yVals, const array& S, const char* const title)
+{
+    af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT};
+    AF_THROW(af_draw_surface(get(), xVals.get(), yVals.get(), S.get(), &temp));
+}
+
 void Window::grid(const int rows, const int cols)
 {
     AF_THROW(af_grid(get(), rows, cols));
diff --git a/src/api/cpp/homography.cpp b/src/api/cpp/homography.cpp
new file mode 100644
index 0000000000..77791047b4
--- /dev/null
+++ b/src/api/cpp/homography.cpp
@@ -0,0 +1,32 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/vision.h>
+#include <af/array.h>
+#include "error.hpp"
+
+namespace af
+{
+
+void homography(array &H, int &inliers,
+                const array &x_src, const array &y_src,
+                const array &x_dst, const array &y_dst,
+                const af_homography_type htype, const float inlier_thr,
+                const unsigned iterations, const af::dtype otype)
+{
+    af_array outH;
+    AF_THROW(af_homography(&outH, &inliers,
+                           x_src.get(), y_src.get(),
+                           x_dst.get(), y_dst.get(),
+                           htype, inlier_thr, iterations, otype));
+
+    H = array(outH);
+}
+
+}
diff --git a/src/api/cpp/imageio.cpp b/src/api/cpp/imageio.cpp
index 7a8087163a..e70b26d1d2 100644
--- a/src/api/cpp/imageio.cpp
+++ b/src/api/cpp/imageio.cpp
@@ -56,4 +56,16 @@ void deleteImageMem(void* ptr)
     AF_THROW(af_delete_image_memory(ptr));
 }
 
+array loadImageNative(const char* filename)
+{
+    af_array out = 0;
+    AF_THROW(af_load_image_native(&out, filename));
+    return array(out);
+}
+
+void saveImageNative(const char* filename, const array& in)
+{
+    AF_THROW(af_save_image_native(filename, in.get()));
+}
+
 }
diff --git a/src/api/cpp/mean.cpp b/src/api/cpp/mean.cpp
index 877ca16d30..980a0d1ba3 100644
--- a/src/api/cpp/mean.cpp
+++ b/src/api/cpp/mean.cpp
@@ -80,6 +80,10 @@ INSTANTIATE_MEAN(int);
 INSTANTIATE_MEAN(unsigned int);
 INSTANTIATE_MEAN(char);
 INSTANTIATE_MEAN(unsigned char);
+INSTANTIATE_MEAN(long long);
+INSTANTIATE_MEAN(unsigned long long);
+INSTANTIATE_MEAN(short);
+INSTANTIATE_MEAN(unsigned short);
 
 #undef INSTANTIATE_MEAN
 
diff --git a/src/api/cpp/median.cpp b/src/api/cpp/median.cpp
index 0528b5ba6d..d047d78a0f 100644
--- a/src/api/cpp/median.cpp
+++ b/src/api/cpp/median.cpp
@@ -29,10 +29,14 @@ INSTANTIATE_MEDIAN(int);
 INSTANTIATE_MEDIAN(unsigned int);
 INSTANTIATE_MEDIAN(char);
 INSTANTIATE_MEDIAN(unsigned char);
+INSTANTIATE_MEDIAN(long long);
+INSTANTIATE_MEDIAN(unsigned long long);
+INSTANTIATE_MEDIAN(short);
+INSTANTIATE_MEDIAN(unsigned short);
 
 #undef INSTANTIATE_MEDIAN
 
-AFAPI array median(const array& in, const dim_t dim)
+array median(const array& in, const dim_t dim)
 {
     af_array temp = 0;
     AF_THROW(af_median(&temp, in.get(), getFNSD(dim, in.dims())));
diff --git a/src/api/cpp/reduce.cpp b/src/api/cpp/reduce.cpp
index d492ef0543..18c12ee63d 100644
--- a/src/api/cpp/reduce.cpp
+++ b/src/api/cpp/reduce.cpp
@@ -115,6 +115,8 @@ namespace af
     INSTANTIATE_REAL(fnC, fnCPP, unsigned long)         \
     INSTANTIATE_REAL(fnC, fnCPP, long long)             \
     INSTANTIATE_REAL(fnC, fnCPP, unsigned long long)    \
+    INSTANTIATE_REAL(fnC, fnCPP, short)                 \
+    INSTANTIATE_REAL(fnC, fnCPP, unsigned short)        \
     INSTANTIATE_REAL(fnC, fnCPP, char)                  \
     INSTANTIATE_REAL(fnC, fnCPP, unsigned char)         \
     INSTANTIATE_CPLX(fnC, fnCPP, af_cfloat, float)      \
@@ -201,6 +203,8 @@ INSTANTIATE(product_nan, product)
     INSTANTIATE_COMPAT(fnCPP, fnCompat, unsigned char)          \
     INSTANTIATE_COMPAT(fnCPP, fnCompat, af_cfloat)              \
     INSTANTIATE_COMPAT(fnCPP, fnCompat, af_cdouble)             \
+    INSTANTIATE_COMPAT(fnCPP, fnCompat, short)                  \
+    INSTANTIATE_COMPAT(fnCPP, fnCompat, unsigned short)         \
 
     INSTANTIATE(product, mul)
     INSTANTIATE(allTrue, alltrue)
@@ -238,6 +242,8 @@ INSTANTIATE(product_nan, product)
     INSTANTIATE_REAL(fn, unsigned)              \
     INSTANTIATE_REAL(fn, char)                  \
     INSTANTIATE_REAL(fn, unsigned char)         \
+    INSTANTIATE_REAL(fn, short)                 \
+    INSTANTIATE_REAL(fn, unsigned short)        \
     INSTANTIATE_CPLX(fn, af_cfloat, float)      \
     INSTANTIATE_CPLX(fn, af_cdouble, double)    \
 
diff --git a/src/api/cpp/seq.cpp b/src/api/cpp/seq.cpp
index a9d5df637e..0ef9326640 100644
--- a/src/api/cpp/seq.cpp
+++ b/src/api/cpp/seq.cpp
@@ -15,8 +15,8 @@
 namespace af
 {
 
-AFAPI int end = -1;
-AFAPI seq span(af_span);
+int end = -1;
+seq span(af_span);
 
 void seq::init(double begin, double end, double step)
 {
diff --git a/src/api/cpp/sift.cpp b/src/api/cpp/sift.cpp
index 74036dce96..8ae3ac6812 100644
--- a/src/api/cpp/sift.cpp
+++ b/src/api/cpp/sift.cpp
@@ -31,4 +31,21 @@ void sift(features& feat, array& desc, const array& in,
     desc = array(temp_desc);
 }
 
+void gloh(features& feat, array& desc, const array& in,
+          const unsigned n_layers, const float contrast_thr,
+          const float edge_thr, const float init_sigma,
+          const bool double_input, const float img_scale,
+          const float feature_ratio)
+{
+    af_features temp_feat;
+    af_array temp_desc = 0;
+    AF_THROW(af_gloh(&temp_feat, &temp_desc, in.get(), n_layers, contrast_thr,
+                     edge_thr, init_sigma, double_input, img_scale, feature_ratio));
+
+    dim_t num = 0;
+    AF_THROW(af_get_features_num(&num, temp_feat));
+    feat = features(temp_feat);
+    desc = array(temp_desc);
+}
+
 }
diff --git a/src/api/cpp/stdev.cpp b/src/api/cpp/stdev.cpp
index b21366a2d8..5a050570a4 100644
--- a/src/api/cpp/stdev.cpp
+++ b/src/api/cpp/stdev.cpp
@@ -42,6 +42,10 @@ INSTANTIATE_STDEV(float);
 INSTANTIATE_STDEV(double);
 INSTANTIATE_STDEV(int);
 INSTANTIATE_STDEV(unsigned int);
+INSTANTIATE_STDEV(intl);
+INSTANTIATE_STDEV(uintl);
+INSTANTIATE_STDEV(short);
+INSTANTIATE_STDEV(unsigned short);
 INSTANTIATE_STDEV(char);
 INSTANTIATE_STDEV(unsigned char);
 
diff --git a/src/api/cpp/timing.cpp b/src/api/cpp/timing.cpp
index f530ba7ef3..2758021beb 100644
--- a/src/api/cpp/timing.cpp
+++ b/src/api/cpp/timing.cpp
@@ -67,20 +67,20 @@ namespace af {
 
 static timer _timer_;
 
-AFAPI timer timer::start()
+timer timer::start()
 {
     return _timer_ = time_now();
 }
-AFAPI double timer::stop(timer start)
+double timer::stop(timer start)
 {
     return time_seconds(start, time_now());
 }
-AFAPI double timer::stop()
+double timer::stop()
 {
     return time_seconds(_timer_, time_now());
 }
 
-AFAPI double timeit(void(*fn)())
+double timeit(void(*fn)())
 {
     // parameters
     int sample_trials = 3;
diff --git a/src/api/cpp/var.cpp b/src/api/cpp/var.cpp
index 224cd9b2e9..bcff1dcf99 100644
--- a/src/api/cpp/var.cpp
+++ b/src/api/cpp/var.cpp
@@ -80,6 +80,8 @@ INSTANTIATE_VAR(int);
 INSTANTIATE_VAR(unsigned int);
 INSTANTIATE_VAR(intl);
 INSTANTIATE_VAR(uintl);
+INSTANTIATE_VAR(short);
+INSTANTIATE_VAR(unsigned short);
 INSTANTIATE_VAR(char);
 INSTANTIATE_VAR(unsigned char);
 
diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt
new file mode 100644
index 0000000000..179293cabc
--- /dev/null
+++ b/src/api/unified/CMakeLists.txt
@@ -0,0 +1,72 @@
+
+FILE(GLOB unified_headers
+    "*.hpp"
+    "*.h")
+
+FILE(GLOB unified_sources
+    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
+
+SOURCE_GROUP(api\\unified\\Headers FILES ${unified_headers})
+SOURCE_GROUP(api\\unified\\Sources FILES ${unified_sources})
+
+FILE(GLOB cpp_sources
+    "../cpp/*.cpp")
+
+SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources})
+
+FILE(GLOB common_sources
+    "../c/util.cpp"
+    "../c/err_common.cpp"
+    "../c/type_util.cpp"
+    "../../backend/dim4.cpp"
+    )
+
+SOURCE_GROUP(common FILES ${common_sources})
+
+IF(NOT UNIX)
+    ADD_DEFINITIONS(-DAFDLL)
+ENDIF()
+
+# OS Definitions
+IF(UNIX)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment")
+ELSE(${UNIX}) #Windows
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
+    SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj")
+ENDIF()
+
+ADD_LIBRARY(af SHARED
+            ${unified_headers}
+            ${unified_sources}
+            ${common_sources}
+            ${cpp_sources}
+            )
+
+IF(${BUILD_CPU})
+    ADD_DEPENDENCIES(af afcpu)
+ENDIF()
+
+IF(${BUILD_CUDA})
+    ADD_DEPENDENCIES(af afcuda)
+ENDIF()
+
+IF(${BUILD_OPENCL})
+    ADD_DEPENDENCIES(af afopencl)
+ENDIF()
+
+SET_TARGET_PROPERTIES(af PROPERTIES
+                      VERSION "${AF_VERSION}"
+                      SOVERSION "${AF_VERSION_MAJOR}")
+
+INSTALL(TARGETS af EXPORT AF DESTINATION "${AF_INSTALL_LIB_DIR}"
+        COMPONENT libraries)
+
+IF(APPLE)
+    INSTALL(SCRIPT "${CMAKE_MODULE_PATH}/osx_install/InstallTool.cmake")
+ENDIF(APPLE)
+
+EXPORT(TARGETS af FILE ArrayFireUnified.cmake)
+INSTALL(EXPORT AF DESTINATION "${AF_INSTALL_CMAKE_DIR}"
+        COMPONENT cmake
+        FILE ArrayFireUnified.cmake)
diff --git a/src/api/unified/algorithm.cpp b/src/api/unified/algorithm.cpp
new file mode 100644
index 0000000000..934b7ae2fc
--- /dev/null
+++ b/src/api/unified/algorithm.cpp
@@ -0,0 +1,148 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/algorithm.h>
+#include "symbol_manager.hpp"
+
+#define ALGO_HAPI_DEF(af_func) \
+af_err af_func(af_array* out, const af_array in, const int dim) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(out, in, dim); \
+}
+
+ALGO_HAPI_DEF(af_sum)
+ALGO_HAPI_DEF(af_product)
+ALGO_HAPI_DEF(af_min)
+ALGO_HAPI_DEF(af_max)
+ALGO_HAPI_DEF(af_all_true)
+ALGO_HAPI_DEF(af_any_true)
+ALGO_HAPI_DEF(af_count)
+ALGO_HAPI_DEF(af_accum)
+ALGO_HAPI_DEF(af_diff1)
+ALGO_HAPI_DEF(af_diff2)
+
+#undef ALGO_HAPI_DEF
+
+#define ALGO_HAPI_DEF(af_func_nan) \
+af_err af_func_nan(af_array* out, const af_array in, const int dim, const double nanval) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(out, in, dim, nanval); \
+}
+
+ALGO_HAPI_DEF(af_sum_nan)
+ALGO_HAPI_DEF(af_product_nan)
+
+#undef ALGO_HAPI_DEF
+
+#define ALGO_HAPI_DEF(af_func_all) \
+af_err af_func_all(double *real, double *imag, const af_array in) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(real, imag, in);\
+}
+
+ALGO_HAPI_DEF(af_sum_all)
+ALGO_HAPI_DEF(af_product_all)
+ALGO_HAPI_DEF(af_min_all)
+ALGO_HAPI_DEF(af_max_all)
+ALGO_HAPI_DEF(af_all_true_all)
+ALGO_HAPI_DEF(af_any_true_all)
+ALGO_HAPI_DEF(af_count_all)
+
+#undef ALGO_HAPI_DEF
+
+#define ALGO_HAPI_DEF(af_func_nan_all) \
+af_err af_func_nan_all(double *real, double *imag, const af_array in, const double nanval) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(real, imag, in, nanval);\
+}
+
+ALGO_HAPI_DEF(af_sum_nan_all)
+ALGO_HAPI_DEF(af_product_nan_all)
+
+#undef ALGO_HAPI_DEF
+
+
+#define ALGO_HAPI_DEF(af_ifunc) \
+af_err af_ifunc(af_array* out, af_array *idx, const af_array in, const int dim) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(out, idx, in, dim); \
+}
+
+ALGO_HAPI_DEF(af_imin)
+ALGO_HAPI_DEF(af_imax)
+
+#undef ALGO_HAPI_DEF
+
+#define ALGO_HAPI_DEF(af_ifunc_all) \
+af_err af_ifunc_all(double *real, double *imag, unsigned *idx, const af_array in) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(real, imag, idx, in);\
+}
+
+ALGO_HAPI_DEF(af_imin_all)
+ALGO_HAPI_DEF(af_imax_all)
+
+#undef ALGO_HAPI_DEF
+
+
+af_err af_where(af_array *idx, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(idx, in);
+}
+
+af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool isAscending)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, dim, isAscending);
+}
+
+af_err af_sort_index(af_array *out, af_array *indices, const af_array in,
+                     const unsigned dim, const bool isAscending)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, indices, in, dim, isAscending);
+}
+
+af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
+                      const af_array keys, const af_array values,
+                      const unsigned dim, const bool isAscending)
+{
+    CHECK_ARRAYS(keys, values);
+    return CALL(out_keys, out_values, keys, values, dim, isAscending);
+}
+
+af_err af_set_unique(af_array *out, const af_array in, const bool is_sorted)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, is_sorted);
+}
+
+af_err af_set_union(af_array *out,
+                    const af_array first, const af_array second,
+                    const bool is_unique)
+{
+    CHECK_ARRAYS(first, second);
+    return CALL(out, first, second, is_unique);
+}
+
+af_err af_set_intersect(af_array *out,
+                        const af_array first, const af_array second,
+                        const bool is_unique)
+{
+    CHECK_ARRAYS(first, second);
+    return CALL(out, first, second, is_unique);
+}
diff --git a/src/api/unified/arith.cpp b/src/api/unified/arith.cpp
new file mode 100644
index 0000000000..c811500773
--- /dev/null
+++ b/src/api/unified/arith.cpp
@@ -0,0 +1,102 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/arith.h>
+#include "symbol_manager.hpp"
+
+#define BINARY_HAPI_DEF(af_func) \
+af_err af_func(af_array* out, const af_array lhs, const af_array rhs, const bool batchMode) \
+{ \
+    CHECK_ARRAYS(lhs, rhs); \
+    return CALL(out, lhs, rhs, batchMode); \
+}
+
+BINARY_HAPI_DEF(af_add)
+BINARY_HAPI_DEF(af_mul)
+BINARY_HAPI_DEF(af_sub)
+BINARY_HAPI_DEF(af_div)
+BINARY_HAPI_DEF(af_maxof)
+BINARY_HAPI_DEF(af_minof)
+BINARY_HAPI_DEF(af_rem)
+BINARY_HAPI_DEF(af_mod)
+BINARY_HAPI_DEF(af_pow)
+BINARY_HAPI_DEF(af_root)
+BINARY_HAPI_DEF(af_atan2)
+BINARY_HAPI_DEF(af_cplx2)
+BINARY_HAPI_DEF(af_eq)
+BINARY_HAPI_DEF(af_neq)
+BINARY_HAPI_DEF(af_gt)
+BINARY_HAPI_DEF(af_ge)
+BINARY_HAPI_DEF(af_lt)
+BINARY_HAPI_DEF(af_le)
+BINARY_HAPI_DEF(af_and)
+BINARY_HAPI_DEF(af_or)
+BINARY_HAPI_DEF(af_bitand)
+BINARY_HAPI_DEF(af_bitor)
+BINARY_HAPI_DEF(af_bitxor)
+BINARY_HAPI_DEF(af_bitshiftl)
+BINARY_HAPI_DEF(af_bitshiftr)
+BINARY_HAPI_DEF(af_hypot)
+
+af_err af_cast(af_array *out, const af_array in, const af_dtype type)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, type);
+}
+
+#define UNARY_HAPI_DEF(af_func) \
+af_err af_func(af_array* out, const af_array in) \
+{ \
+    CHECK_ARRAYS(in); \
+    return CALL(out, in); \
+}
+
+UNARY_HAPI_DEF(af_abs)
+UNARY_HAPI_DEF(af_arg)
+UNARY_HAPI_DEF(af_sign)
+UNARY_HAPI_DEF(af_round)
+UNARY_HAPI_DEF(af_trunc)
+UNARY_HAPI_DEF(af_floor)
+UNARY_HAPI_DEF(af_ceil)
+UNARY_HAPI_DEF(af_sin)
+UNARY_HAPI_DEF(af_cos)
+UNARY_HAPI_DEF(af_tan)
+UNARY_HAPI_DEF(af_asin)
+UNARY_HAPI_DEF(af_acos)
+UNARY_HAPI_DEF(af_atan)
+UNARY_HAPI_DEF(af_cplx)
+UNARY_HAPI_DEF(af_real)
+UNARY_HAPI_DEF(af_imag)
+UNARY_HAPI_DEF(af_conjg)
+UNARY_HAPI_DEF(af_sinh)
+UNARY_HAPI_DEF(af_cosh)
+UNARY_HAPI_DEF(af_tanh)
+UNARY_HAPI_DEF(af_asinh)
+UNARY_HAPI_DEF(af_acosh)
+UNARY_HAPI_DEF(af_atanh)
+UNARY_HAPI_DEF(af_pow2)
+UNARY_HAPI_DEF(af_exp)
+UNARY_HAPI_DEF(af_sigmoid)
+UNARY_HAPI_DEF(af_expm1)
+UNARY_HAPI_DEF(af_erf)
+UNARY_HAPI_DEF(af_erfc)
+UNARY_HAPI_DEF(af_log)
+UNARY_HAPI_DEF(af_log1p)
+UNARY_HAPI_DEF(af_log10)
+UNARY_HAPI_DEF(af_log2)
+UNARY_HAPI_DEF(af_sqrt)
+UNARY_HAPI_DEF(af_cbrt)
+UNARY_HAPI_DEF(af_factorial)
+UNARY_HAPI_DEF(af_tgamma)
+UNARY_HAPI_DEF(af_lgamma)
+UNARY_HAPI_DEF(af_iszero)
+UNARY_HAPI_DEF(af_isinf)
+UNARY_HAPI_DEF(af_isnan)
+UNARY_HAPI_DEF(af_not)
diff --git a/src/api/unified/array.cpp b/src/api/unified/array.cpp
new file mode 100644
index 0000000000..59158ca195
--- /dev/null
+++ b/src/api/unified/array.cpp
@@ -0,0 +1,108 @@
+/*******************************************************
+ * Copyright(c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include "symbol_manager.hpp"
+
+af_err af_create_array(af_array *arr, const void * const data, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(arr, data, ndims, dims, type);
+}
+
+af_err af_create_handle(af_array *arr, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(arr, ndims, dims, type);
+}
+
+af_err af_copy_array(af_array *arr, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(arr, in);
+}
+
+af_err af_write_array(af_array arr, const void *data, const size_t bytes, af_source src)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr, data, bytes, src);
+}
+
+af_err af_get_data_ptr(void *data, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(data, arr);
+}
+
+af_err af_release_array(af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
+af_err af_retain_array(af_array *out, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in);
+}
+
+af_err af_get_data_ref_count(int *use_count, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(use_count, in);
+}
+
+af_err af_eval(af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(in);
+}
+
+af_err af_get_elements(dim_t *elems, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(elems, arr);
+}
+
+af_err af_get_type(af_dtype *type, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(type, arr);
+}
+
+af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(d0, d1, d2, d3, arr);
+}
+
+af_err af_get_numdims(unsigned *result, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(result, arr);
+}
+
+#define ARRAY_HAPI_DEF(af_func) \
+af_err af_func(bool *result, const af_array arr)\
+{\
+    CHECK_ARRAYS(arr); \
+    return CALL(result, arr);\
+}
+
+ARRAY_HAPI_DEF(af_is_empty)
+ARRAY_HAPI_DEF(af_is_scalar)
+ARRAY_HAPI_DEF(af_is_row)
+ARRAY_HAPI_DEF(af_is_column)
+ARRAY_HAPI_DEF(af_is_vector)
+ARRAY_HAPI_DEF(af_is_complex)
+ARRAY_HAPI_DEF(af_is_real)
+ARRAY_HAPI_DEF(af_is_double)
+ARRAY_HAPI_DEF(af_is_single)
+ARRAY_HAPI_DEF(af_is_realfloating)
+ARRAY_HAPI_DEF(af_is_floating)
+ARRAY_HAPI_DEF(af_is_integer)
+ARRAY_HAPI_DEF(af_is_bool)
diff --git a/src/api/unified/blas.cpp b/src/api/unified/blas.cpp
new file mode 100644
index 0000000000..547e3ac428
--- /dev/null
+++ b/src/api/unified/blas.cpp
@@ -0,0 +1,40 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/blas.h>
+#include "symbol_manager.hpp"
+
+af_err af_matmul( af_array *out ,
+        const af_array lhs, const af_array rhs,
+        const af_mat_prop optLhs, const af_mat_prop optRhs)
+{
+    CHECK_ARRAYS(lhs, rhs);
+    return CALL(out, lhs, rhs, optLhs, optRhs);
+}
+
+
+af_err af_dot(    af_array *out,
+        const af_array lhs, const af_array rhs,
+        const af_mat_prop optLhs, const af_mat_prop optRhs)
+{
+    CHECK_ARRAYS(lhs, rhs);
+    return CALL(out, lhs, rhs, optLhs, optRhs);
+}
+
+af_err af_transpose(af_array *out, af_array in, const bool conjugate)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, conjugate);
+}
+
+af_err af_transpose_inplace(af_array in, const bool conjugate)
+{
+    CHECK_ARRAYS(in);
+    return CALL(in, conjugate);
+}
diff --git a/src/api/unified/data.cpp b/src/api/unified/data.cpp
new file mode 100644
index 0000000000..236b11f7e2
--- /dev/null
+++ b/src/api/unified/data.cpp
@@ -0,0 +1,180 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/data.h>
+#include "symbol_manager.hpp"
+
+af_err af_constant(af_array *result, const double value,
+                   const unsigned ndims, const dim_t * const dims,
+                   const af_dtype type)
+{
+    return CALL(result, value, ndims, dims, type);
+}
+
+
+af_err af_constant_complex(af_array *arr, const double real, const double imag,
+        const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(arr, real, imag, ndims, dims, type);
+}
+
+
+af_err af_constant_long (af_array *arr, const  intl val, const unsigned ndims, const dim_t * const dims)
+{
+    return CALL(arr, val, ndims, dims);
+}
+
+
+af_err af_constant_ulong(af_array *arr, const uintl val, const unsigned ndims, const dim_t * const dims)
+{
+    return CALL(arr, val, ndims, dims);
+}
+
+af_err af_range(af_array *out, const unsigned ndims, const dim_t * const dims,
+        const int seq_dim, const af_dtype type)
+{
+    return CALL(out, ndims, dims, seq_dim, type);
+}
+
+af_err af_iota(af_array *out, const unsigned ndims, const dim_t * const dims,
+        const unsigned t_ndims, const dim_t * const tdims, const af_dtype type)
+{
+    return CALL(out, ndims, dims, t_ndims, tdims, type);
+}
+
+af_err af_randu(af_array *out, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(out, ndims, dims, type);
+}
+
+af_err af_randn(af_array *out, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(out, ndims, dims, type);
+}
+
+af_err af_set_seed(const uintl seed)
+{
+    return CALL(seed);
+}
+
+af_err af_get_seed(uintl *seed)
+{
+    return CALL(seed);
+}
+
+af_err af_identity(af_array *out, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(out, ndims, dims, type);
+}
+
+af_err af_diag_create(af_array *out, const af_array in, const int num)
+{
+    CHECK_ARRAYS(in)
+    return CALL(out, in, num);
+}
+
+af_err af_diag_extract(af_array *out, const af_array in, const int num)
+{
+    CHECK_ARRAYS(in)
+    return CALL(out, in, num);
+}
+
+af_err af_join(af_array *out, const int dim, const af_array first, const af_array second)
+{
+    CHECK_ARRAYS(first, second)
+    return CALL(out, dim, first, second);
+}
+
+af_err af_join_many(af_array *out, const int dim, const unsigned n_arrays, const af_array *inputs)
+{
+    for(unsigned i = 0; i < n_arrays; i++)
+        CHECK_ARRAYS(inputs[i]);
+    return CALL(out, dim, n_arrays, inputs);
+}
+
+af_err af_tile(af_array *out, const af_array in,
+        const unsigned x, const unsigned y, const unsigned z, const unsigned w)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, x, y, z, w);
+}
+
+af_err af_reorder(af_array *out, const af_array in,
+        const unsigned x, const unsigned y, const unsigned z, const unsigned w)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, x, y, z, w);
+}
+
+af_err af_shift(af_array *out, const af_array in, const int x, const int y, const int z, const int w)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, x, y, z, w);
+}
+
+af_err af_moddims(af_array *out, const af_array in, const unsigned ndims, const dim_t * const dims)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, ndims, dims);
+}
+
+af_err af_flat(af_array *out, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in);
+}
+
+af_err af_flip(af_array *out, const af_array in, const unsigned dim)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, dim);
+}
+
+af_err af_lower(af_array *out, const af_array in, bool is_unit_diag)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, is_unit_diag);
+}
+
+af_err af_upper(af_array *out, const af_array in, bool is_unit_diag)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, is_unit_diag);
+}
+
+af_err af_select(af_array *out, const af_array cond, const af_array a, const af_array b)
+{
+    CHECK_ARRAYS(cond, a, b);
+    return CALL(out, cond, a, b);
+}
+
+af_err af_select_scalar_r(af_array *out, const af_array cond, const af_array a, const double b)
+{
+    CHECK_ARRAYS(cond, a);
+    return CALL(out, cond, a, b);
+}
+
+af_err af_select_scalar_l(af_array *out, const af_array cond, const double a, const af_array b)
+{
+    CHECK_ARRAYS(cond, b);
+    return CALL(out, cond, a, b);
+}
+
+af_err af_replace(af_array a, const af_array cond, const af_array b)
+{
+    CHECK_ARRAYS(a, cond, b);
+    return CALL(a, cond, b);
+}
+
+af_err af_replace_scalar(af_array a, const af_array cond, const double b)
+{
+    CHECK_ARRAYS(a, cond);
+    return CALL(a, cond, b);
+}
diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp
new file mode 100644
index 0000000000..43559a077a
--- /dev/null
+++ b/src/api/unified/device.cpp
@@ -0,0 +1,140 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/backend.h>
+#include <af/device.h>
+#include "symbol_manager.hpp"
+
+af_err af_set_backend(const af_backend bknd)
+{
+    return unified::AFSymbolManager::getInstance().setBackend(bknd);
+}
+
+af_err af_get_backend_count(unsigned* num_backends)
+{
+    *num_backends = unified::AFSymbolManager::getInstance().getBackendCount();
+    return AF_SUCCESS;
+}
+
+af_err af_get_available_backends(int* result)
+{
+    *result = unified::AFSymbolManager::getInstance().getAvailableBackends();
+    return AF_SUCCESS;
+}
+
+af_err af_get_backend_id(af_backend *result, const af_array in)
+{
+    // DO NOT CALL CHECK_ARRAYS HERE.
+    // IT WILL RESULT IN AN INFINITE RECURSION
+    return CALL(result, in);
+}
+
+af_err af_info()
+{
+    return CALL_NO_PARAMS();
+}
+
+af_err af_init()
+{
+    return CALL_NO_PARAMS();
+}
+
+af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute)
+{
+    return CALL(d_name, d_platform, d_toolkit, d_compute);
+}
+
+af_err af_get_device_count(int *num_of_devices)
+{
+    return CALL(num_of_devices);
+}
+
+af_err af_get_dbl_support(bool* available, const int device)
+{
+    return CALL(available, device);
+}
+
+af_err af_set_device(const int device)
+{
+    return CALL(device);
+}
+
+af_err af_get_device(int *device)
+{
+    return CALL(device);
+}
+
+af_err af_sync(const int device)
+{
+    return CALL(device);
+}
+
+af_err af_alloc_device(void **ptr, const dim_t bytes)
+{
+    return CALL(ptr, bytes);
+}
+
+af_err af_alloc_pinned(void **ptr, const dim_t bytes)
+{
+    return CALL(ptr, bytes);
+}
+
+af_err af_free_device(void *ptr)
+{
+    return CALL(ptr);
+}
+
+af_err af_free_pinned(void *ptr)
+{
+    return CALL(ptr);
+}
+
+af_err af_device_array(af_array *arr, const void *data, const unsigned ndims, const dim_t * const dims, const af_dtype type)
+{
+    return CALL(arr, data, ndims, dims, type);
+}
+
+af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers,
+        size_t *lock_bytes, size_t *lock_buffers)
+{
+    return CALL(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers);
+}
+
+af_err af_device_gc()
+{
+    return CALL_NO_PARAMS();
+}
+
+af_err af_set_mem_step_size(const size_t step_bytes)
+{
+    return CALL(step_bytes);
+}
+
+af_err af_get_mem_step_size(size_t *step_bytes)
+{
+    return CALL(step_bytes);
+}
+
+af_err af_lock_device_ptr(const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
+af_err af_unlock_device_ptr(const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
+af_err af_get_device_ptr(void **ptr, const af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(ptr, arr);
+}
diff --git a/src/api/unified/features.cpp b/src/api/unified/features.cpp
new file mode 100644
index 0000000000..5eac8f72bb
--- /dev/null
+++ b/src/api/unified/features.cpp
@@ -0,0 +1,44 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/features.h>
+#include "symbol_manager.hpp"
+
+af_err af_create_features(af_features *feat, dim_t num)
+{
+    return CALL(feat, num);
+}
+
+af_err af_retain_features(af_features *out, const af_features feat)
+{
+    return CALL(out, feat);
+}
+
+af_err af_get_features_num(dim_t *num, const af_features feat)
+{
+    return CALL(num, feat);
+}
+
+#define FEAT_HAPI_DEF(af_func)\
+af_err af_func(af_array *out, const af_features feat)\
+{\
+    return CALL(out, feat);\
+}
+
+FEAT_HAPI_DEF(af_get_features_xpos)
+FEAT_HAPI_DEF(af_get_features_ypos)
+FEAT_HAPI_DEF(af_get_features_score)
+FEAT_HAPI_DEF(af_get_features_orientation)
+FEAT_HAPI_DEF(af_get_features_size)
+
+af_err af_release_features(af_features feat)
+{
+    return CALL(feat);
+}
diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp
new file mode 100644
index 0000000000..81076f233c
--- /dev/null
+++ b/src/api/unified/graphics.cpp
@@ -0,0 +1,83 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/graphics.h>
+#include "symbol_manager.hpp"
+
+
+af_err af_create_window(af_window *out, const int width, const int height, const char* const title)
+{
+    return CALL(out, width, height, title);
+}
+
+af_err af_set_position(const af_window wind, const unsigned x, const unsigned y)
+{
+    return CALL(wind, x, y);
+}
+
+af_err af_set_title(const af_window wind, const char* const title)
+{
+    return CALL(wind, title);
+}
+
+af_err af_set_size(const af_window wind, const unsigned w, const unsigned h)
+{
+    return CALL(wind, w, h);
+}
+
+af_err af_draw_image(const af_window wind, const af_array in, const af_cell* const props)
+{
+    CHECK_ARRAYS(in);
+    return CALL(wind, in, props);
+}
+
+af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props)
+{
+    CHECK_ARRAYS(X, Y);
+    return CALL(wind, X, Y, props);
+}
+
+af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props)
+{
+    CHECK_ARRAYS(P);
+    return CALL(wind, P, props);
+}
+
+af_err af_draw_hist(const af_window wind, const af_array X, const double minval, const double maxval, const af_cell* const props)
+{
+    CHECK_ARRAYS(X);
+    return CALL(wind, X, minval, maxval, props);
+}
+
+af_err af_draw_surface(const af_window wind, const af_array xVals, const af_array yVals, const af_array S, const af_cell* const props)
+{
+    CHECK_ARRAYS(xVals, yVals, S);
+    return CALL(wind, xVals, yVals, S, props);
+}
+
+af_err af_grid(const af_window wind, const int rows, const int cols)
+{
+    return CALL(wind, rows, cols);
+}
+
+af_err af_show(const af_window wind)
+{
+    return CALL(wind);
+}
+
+af_err af_is_window_closed(bool *out, const af_window wind)
+{
+    return CALL(out, wind);
+}
+
+af_err af_destroy_window(const af_window wind)
+{
+    return CALL(wind);
+}
diff --git a/src/api/unified/image.cpp b/src/api/unified/image.cpp
new file mode 100644
index 0000000000..d0f9aa6200
--- /dev/null
+++ b/src/api/unified/image.cpp
@@ -0,0 +1,252 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/image.h>
+#include "symbol_manager.hpp"
+
+af_err af_gradient(af_array *dx, af_array *dy, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(dx, dy, in);
+}
+
+af_err af_load_image(af_array *out, const char* filename, const bool isColor)
+{
+    return CALL(out, filename, isColor);
+}
+
+af_err af_save_image(const char* filename, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(filename, in);
+}
+
+af_err af_load_image_memory(af_array *out, const void* ptr)
+{
+    return CALL(out, ptr);
+}
+
+af_err af_save_image_memory(void** ptr, const af_array in, const af_image_format format)
+{
+    CHECK_ARRAYS(in);
+    return CALL(ptr, in, format);
+}
+
+af_err af_delete_image_memory(void* ptr)
+{
+    return CALL(ptr);
+}
+
+af_err af_load_image_native(af_array *out, const char* filename)
+{
+    return CALL(out, filename);
+}
+
+af_err af_save_image_native(const char* filename, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(filename, in);
+}
+
+af_err af_resize(af_array *out, const af_array in, const dim_t odim0, const dim_t odim1, const af_interp_type method)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, odim0, odim1, method);
+}
+
+af_err af_transform(af_array *out, const af_array in, const af_array transform,
+        const dim_t odim0, const dim_t odim1,
+        const af_interp_type method, const bool inverse)
+{
+    CHECK_ARRAYS(in, transform);
+    return CALL(out, in, transform, odim0, odim1, method, inverse);
+}
+
+af_err af_rotate(af_array *out, const af_array in, const float theta,
+        const bool crop, const af_interp_type method)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, theta, crop, method);
+}
+
+af_err af_translate(af_array *out, const af_array in, const float trans0, const float trans1,
+        const dim_t odim0, const dim_t odim1, const af_interp_type method)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, trans0, trans1, odim0, odim1, method);
+}
+
+af_err af_scale(af_array *out, const af_array in, const float scale0, const float scale1,
+        const dim_t odim0, const dim_t odim1, const af_interp_type method)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, scale0, scale1, odim0, odim1, method);
+}
+
+af_err af_skew(af_array *out, const af_array in, const float skew0, const float skew1,
+        const dim_t odim0, const dim_t odim1, const af_interp_type method,
+        const bool inverse)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, skew0, skew1, odim0, odim1, method, inverse);
+}
+
+af_err af_histogram(af_array *out, const af_array in, const unsigned nbins, const double minval, const double maxval)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, nbins, minval, maxval);
+}
+
+af_err af_dilate(af_array *out, const af_array in, const af_array mask)
+{
+    CHECK_ARRAYS(in, mask);
+    return CALL(out, in, mask);
+}
+
+af_err af_dilate3(af_array *out, const af_array in, const af_array mask)
+{
+    CHECK_ARRAYS(in, mask);
+    return CALL(out, in, mask);
+}
+
+af_err af_erode(af_array *out, const af_array in, const af_array mask)
+{
+    CHECK_ARRAYS(in, mask);
+    return CALL(out, in, mask);
+}
+
+af_err af_erode3(af_array *out, const af_array in, const af_array mask)
+{
+    CHECK_ARRAYS(in, mask);
+    return CALL(out, in, mask);
+}
+
+af_err af_bilateral(af_array *out, const af_array in, const float spatial_sigma, const float chromatic_sigma, const bool isColor)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, spatial_sigma, chromatic_sigma, isColor);
+}
+
+af_err af_mean_shift(af_array *out, const af_array in, const float spatial_sigma, const float chromatic_sigma, const unsigned iter, const bool is_color)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, spatial_sigma, chromatic_sigma, iter, is_color);
+}
+
+af_err af_medfilt(af_array *out, const af_array in, const dim_t wind_length, const dim_t wind_width, const af_border_type edge_pad)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, wind_length, wind_width, edge_pad);
+}
+
+af_err af_minfilt(af_array *out, const af_array in, const dim_t wind_length, const dim_t wind_width, const af_border_type edge_pad)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, wind_length, wind_width, edge_pad);
+}
+
+af_err af_maxfilt(af_array *out, const af_array in, const dim_t wind_length, const dim_t wind_width, const af_border_type edge_pad)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, wind_length, wind_width, edge_pad);
+}
+
+af_err af_regions(af_array *out, const af_array in, const af_connectivity connectivity, const af_dtype ty)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, connectivity, ty);
+}
+
+af_err af_sobel_operator(af_array *dx, af_array *dy, const af_array img, const unsigned ker_size)
+{
+    CHECK_ARRAYS(img);
+    return CALL(dx, dy, img, ker_size);
+}
+
+af_err af_rgb2gray(af_array* out, const af_array in, const float rPercent, const float gPercent, const float bPercent)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, rPercent, gPercent, bPercent);
+}
+
+af_err af_gray2rgb(af_array* out, const af_array in, const float rFactor, const float gFactor, const float bFactor)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, rFactor, gFactor, bFactor);
+}
+
+af_err af_hist_equal(af_array *out, const af_array in, const af_array hist)
+{
+    CHECK_ARRAYS(in, hist);
+    return CALL(out, in, hist);
+}
+
+af_err af_gaussian_kernel(af_array *out,
+        const int rows, const int cols,
+        const double sigma_r, const double sigma_c)
+{
+    return CALL(out, rows, cols, sigma_r, sigma_c);
+}
+
+af_err af_hsv2rgb(af_array* out, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in);
+}
+
+af_err af_rgb2hsv(af_array* out, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in);
+}
+
+af_err af_color_space(af_array *out, const af_array image, const af_cspace_t to, const af_cspace_t from)
+{
+    CHECK_ARRAYS(image);
+    return CALL(out, image, to, from);
+}
+
+af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
+        const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
+        const bool is_column)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, wx, wy, sx, sy, px, py, is_column);
+}
+
+af_err af_wrap(af_array *out,
+        const af_array in,
+        const dim_t ox, const dim_t oy,
+        const dim_t wx, const dim_t wy,
+        const dim_t sx, const dim_t sy,
+        const dim_t px, const dim_t py,
+        const bool is_column)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, ox, oy, wx, wy, sx, sy, px, py, is_column);
+}
+
+af_err af_sat(af_array *out, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in);
+}
+
+af_err af_ycbcr2rgb(af_array* out, const af_array in, const af_ycc_std standard)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, standard);
+}
+
+af_err af_rgb2ycbcr(af_array* out, const af_array in, const af_ycc_std standard)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, standard);
+}
diff --git a/src/api/unified/index.cpp b/src/api/unified/index.cpp
new file mode 100644
index 0000000000..0927dd8b71
--- /dev/null
+++ b/src/api/unified/index.cpp
@@ -0,0 +1,54 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/index.h>
+#include "symbol_manager.hpp"
+
+af_err af_index(  af_array *out,
+        const af_array in,
+        const unsigned ndims, const af_seq* const index)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, ndims, index);
+}
+
+af_err af_lookup( af_array *out,
+        const af_array in, const af_array indices,
+        const unsigned dim)
+{
+    CHECK_ARRAYS(in, indices);
+    return CALL(out, in, indices, dim);
+}
+
+af_err af_assign_seq( af_array *out,
+        const af_array lhs,
+        const unsigned ndims, const af_seq* const indices,
+        const af_array rhs)
+{
+    CHECK_ARRAYS(lhs, rhs);
+    return CALL(out, lhs, ndims, indices, rhs);
+}
+
+af_err af_index_gen(  af_array *out,
+        const af_array in,
+        const dim_t ndims, const af_index_t* indices)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, ndims, indices);
+}
+
+af_err af_assign_gen( af_array *out,
+        const af_array lhs,
+        const dim_t ndims, const af_index_t* indices,
+        const af_array rhs)
+{
+    CHECK_ARRAYS(lhs, rhs);
+    return CALL(out, lhs, ndims, indices, rhs);
+}
diff --git a/src/api/unified/lapack.cpp b/src/api/unified/lapack.cpp
new file mode 100644
index 0000000000..b2364ac858
--- /dev/null
+++ b/src/api/unified/lapack.cpp
@@ -0,0 +1,98 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/lapack.h>
+#include "symbol_manager.hpp"
+
+af_err af_svd(af_array *u, af_array *s, af_array *vt, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(u, s, vt, in);
+}
+
+af_err af_svd_inplace(af_array *u, af_array *s, af_array *vt, af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(u, s, vt, in);
+}
+
+af_err af_lu(af_array *lower, af_array *upper, af_array *pivot, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(lower, upper, pivot, in);
+}
+
+af_err af_lu_inplace(af_array *pivot, af_array in, const bool is_lapack_piv)
+{
+    CHECK_ARRAYS(in);
+    return CALL(pivot, in, is_lapack_piv);
+}
+
+af_err af_qr(af_array *q, af_array *r, af_array *tau, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(q, r, tau, in);
+}
+
+af_err af_qr_inplace(af_array *tau, af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(tau, in);
+}
+
+af_err af_cholesky(af_array *out, int *info, const af_array in, const bool is_upper)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, info, in, is_upper);
+}
+
+af_err af_cholesky_inplace(int *info, af_array in, const bool is_upper)
+{
+    CHECK_ARRAYS(in);
+    return CALL(info, in, is_upper);
+}
+
+af_err af_solve(af_array *x, const af_array a, const af_array b,
+        const af_mat_prop options)
+{
+    CHECK_ARRAYS(a, b);
+    return CALL(x, a, b, options);
+}
+
+af_err af_solve_lu(af_array *x, const af_array a, const af_array piv,
+        const af_array b, const af_mat_prop options)
+{
+    CHECK_ARRAYS(a, piv, b);
+    return CALL(x, a, piv, b, options);
+}
+
+af_err af_inverse(af_array *out, const af_array in, const af_mat_prop options)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, options);
+}
+
+af_err af_rank(unsigned *rank, const af_array in, const double tol)
+{
+    CHECK_ARRAYS(in);
+    return CALL(rank, in, tol);
+}
+
+af_err af_det(double *det_real, double *det_imag, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(det_real, det_imag, in);
+}
+
+af_err af_norm(double *out, const af_array in, const af_norm_type type, const double p, const double q)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, type, p, q);
+}
diff --git a/src/api/unified/signal.cpp b/src/api/unified/signal.cpp
new file mode 100644
index 0000000000..138a0d6905
--- /dev/null
+++ b/src/api/unified/signal.cpp
@@ -0,0 +1,143 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/signal.h>
+#include "symbol_manager.hpp"
+
+af_err af_approx1(af_array *out, const af_array in, const af_array pos, const af_interp_type method, const float offGrid)
+{
+    CHECK_ARRAYS(in, pos);
+    return CALL(out, in, pos, method, offGrid);
+}
+
+af_err af_approx2(af_array *out, const af_array in, const af_array pos0, const af_array pos1, const af_interp_type method, const float offGrid)
+{
+    CHECK_ARRAYS(in, pos0, pos1);
+    return CALL(out, in, pos0, pos1, method, offGrid);
+}
+
+#define FFT_HAPI_DEF(af_func)\
+af_err af_func(af_array in, const double norm_factor)\
+{\
+    CHECK_ARRAYS(in); \
+    return CALL(in, norm_factor);\
+}
+
+FFT_HAPI_DEF(af_fft_inplace)
+FFT_HAPI_DEF(af_fft2_inplace)
+FFT_HAPI_DEF(af_fft3_inplace)
+FFT_HAPI_DEF(af_ifft_inplace)
+FFT_HAPI_DEF(af_ifft2_inplace)
+FFT_HAPI_DEF(af_ifft3_inplace)
+
+af_err af_fft(af_array *out, const af_array in, const double norm_factor, const dim_t odim0)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0);
+}
+
+af_err af_fft2(af_array *out, const af_array in, const double norm_factor, const dim_t odim0, const dim_t odim1)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0, odim1);
+}
+
+af_err af_fft3(af_array *out, const af_array in, const double norm_factor, const dim_t odim0, const dim_t odim1, const dim_t odim2)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0, odim1, odim2);
+}
+
+af_err af_ifft(af_array *out, const af_array in, const double norm_factor, const dim_t odim0)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0);
+}
+
+af_err af_ifft2(af_array *out, const af_array in, const double norm_factor, const dim_t odim0, const dim_t odim1)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0, odim1);
+}
+
+af_err af_ifft3(af_array *out, const af_array in, const double norm_factor, const dim_t odim0, const dim_t odim1, const dim_t odim2)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, odim0, odim1, odim2);
+}
+
+af_err af_fft_r2c (af_array *out, const af_array in, const double norm_factor, const dim_t pad0)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, pad0);
+}
+
+af_err af_fft2_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, pad0, pad1);
+}
+
+af_err af_fft3_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1, const dim_t pad2)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, norm_factor, pad0, pad1, pad2);
+}
+
+#define FFTC2R_HAPI_DEF(af_func)\
+af_err af_func(af_array *out, const af_array in, const double norm_factor, const bool is_odd)\
+{\
+    CHECK_ARRAYS(in); \
+    return CALL(out, in, norm_factor, is_odd);\
+}
+
+FFTC2R_HAPI_DEF(af_fft_c2r)
+FFTC2R_HAPI_DEF(af_fft2_c2r)
+FFTC2R_HAPI_DEF(af_fft3_c2r)
+
+#define CONV_HAPI_DEF(af_func)\
+af_err af_func(af_array *out, const af_array signal, const af_array filter, const af_conv_mode mode, af_conv_domain domain)\
+{\
+    CHECK_ARRAYS(signal, filter); \
+    return CALL(out, signal, filter, mode, domain);\
+}
+
+CONV_HAPI_DEF(af_convolve1)
+CONV_HAPI_DEF(af_convolve2)
+CONV_HAPI_DEF(af_convolve3)
+
+#define FFT_CONV_HAPI_DEF(af_func)\
+af_err af_func(af_array *out, const af_array signal, const af_array filter, const af_conv_mode mode)\
+{\
+    CHECK_ARRAYS(signal, filter); \
+    return CALL(out, signal, filter, mode);\
+}
+
+FFT_CONV_HAPI_DEF(af_fft_convolve1)
+FFT_CONV_HAPI_DEF(af_fft_convolve2)
+FFT_CONV_HAPI_DEF(af_fft_convolve3)
+
+af_err af_convolve2_sep(af_array *out, const af_array col_filter, const af_array row_filter, const af_array signal, const af_conv_mode mode)
+{
+    CHECK_ARRAYS(col_filter, row_filter, signal);
+    return CALL(out, col_filter, row_filter, signal, mode);
+}
+
+af_err af_fir(af_array *y, const af_array b, const af_array x)
+{
+    CHECK_ARRAYS(b, x);
+    return CALL(y, b, x);
+}
+
+af_err af_iir(af_array *y, const af_array b, const af_array a, const af_array x)
+{
+    CHECK_ARRAYS(b, a, x);
+    return CALL(y, b, a, x);
+}
diff --git a/src/api/unified/statistics.cpp b/src/api/unified/statistics.cpp
new file mode 100644
index 0000000000..9f72674d04
--- /dev/null
+++ b/src/api/unified/statistics.cpp
@@ -0,0 +1,96 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/statistics.h>
+#include "symbol_manager.hpp"
+
+af_err af_mean(af_array *out, const af_array in, const dim_t dim)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, dim);
+}
+
+af_err af_mean_weighted(af_array *out, const af_array in, const af_array weights, const dim_t dim)
+{
+    CHECK_ARRAYS(in, weights);
+    return CALL(out, in, weights, dim);
+}
+
+af_err af_var(af_array *out, const af_array in, const bool isbiased, const dim_t dim)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, isbiased, dim);
+}
+
+af_err af_var_weighted(af_array *out, const af_array in, const af_array weights, const dim_t dim)
+{
+    CHECK_ARRAYS(in, weights);
+    return CALL(out, in, weights, dim);
+}
+
+af_err af_stdev(af_array *out, const af_array in, const dim_t dim)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, dim);
+}
+
+af_err af_cov(af_array* out, const af_array X, const af_array Y, const bool isbiased)
+{
+    CHECK_ARRAYS(X, Y);
+    return CALL(out, X, Y, isbiased);
+}
+
+af_err af_median(af_array* out, const af_array in, const dim_t dim)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, dim);
+}
+
+af_err af_mean_all(double *real, double *imag, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(real, imag, in);
+}
+
+af_err af_mean_all_weighted(double *real, double *imag, const af_array in, const af_array weights)
+{
+    CHECK_ARRAYS(in, weights);
+    return CALL(real, imag, in, weights);
+}
+
+af_err af_var_all(double *realVal, double *imagVal, const af_array in, const bool isbiased)
+{
+    CHECK_ARRAYS(in);
+    return CALL(realVal, imagVal, in, isbiased);
+}
+
+af_err af_var_all_weighted(double *realVal, double *imagVal, const af_array in, const af_array weights)
+{
+    CHECK_ARRAYS(in, weights);
+    return CALL(realVal, imagVal, in, weights);
+}
+
+af_err af_stdev_all(double *real, double *imag, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(real, imag, in);
+}
+
+af_err af_median_all(double *realVal, double *imagVal, const af_array in)
+{
+    CHECK_ARRAYS(in);
+    return CALL(realVal, imagVal, in);
+}
+
+af_err af_corrcoef(double *realVal, double *imagVal, const af_array X, const af_array Y)
+{
+    CHECK_ARRAYS(X, Y);
+    return CALL(realVal, imagVal, X, Y);
+}
diff --git a/src/api/unified/symbol_manager.cpp b/src/api/unified/symbol_manager.cpp
new file mode 100644
index 0000000000..1139f99b3e
--- /dev/null
+++ b/src/api/unified/symbol_manager.cpp
@@ -0,0 +1,223 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include "symbol_manager.hpp"
+#include <algorithm>
+#include <string>
+#include <cmath>
+
+using std::string;
+using std::replace;
+
+namespace unified
+{
+
+static const string LIB_AF_BKND_NAME[NUM_BACKENDS] = {"cpu", "cuda", "opencl"};
+#if defined(OS_WIN)
+static const string LIB_AF_BKND_PREFIX = "af";
+static const string LIB_AF_BKND_SUFFIX = ".dll";
+#define RTLD_LAZY 0
+#else
+static const string LIB_AF_BKND_PREFIX = "libaf";
+#if defined(__APPLE__)
+static const string LIB_AF_BKND_SUFFIX = ".dylib";
+#else
+static const string LIB_AF_BKND_SUFFIX = ".so";
+#endif // APPLE
+#endif
+
+static const string LIB_AF_ENVARS[NUM_ENV_VARS] = {"AF_PATH", "AF_BUILD_PATH"};
+static const string LIB_AF_RPATHS[NUM_ENV_VARS] = {"/lib/", "/src/backend/"};
+static const bool LIB_AF_RPATH_SUFFIX[NUM_ENV_VARS] = {false, true};
+
+inline string getBkndLibName(const int backend_index)
+{
+    int i = backend_index >=0 && backend_index<NUM_BACKENDS ? backend_index : 0;
+    return LIB_AF_BKND_PREFIX + LIB_AF_BKND_NAME[i] + LIB_AF_BKND_SUFFIX;
+}
+
+inline std::string getEnvVar(const std::string &key)
+{
+#if defined(OS_WIN)
+    DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation
+    string retVal;
+    retVal.resize(bufSize);
+    bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize);
+    if (!bufSize) {
+        return string("");
+    } else {
+        retVal.resize(bufSize);
+        return retVal;
+    }
+#else
+    char * str = getenv(key.c_str());
+    return str==NULL ? string("") : string(str);
+#endif
+}
+
+/*flag parameter is not used on windows platform */
+LibHandle openDynLibrary(const int bknd_idx, int flag=RTLD_LAZY)
+{
+    /*
+     * The default search path is the colon separated list of
+     * paths stored in the environment variables:
+     * * LD_LIBRARY_PATH(Linux/Unix/Apple)
+     * * DYLD_LIBRARY_PATH (Apple)
+     * * PATH (Windows)
+    */
+    string bkndName = getBkndLibName(bknd_idx);
+    string show_flag = getEnvVar("AF_SHOW_LOAD_PATH");
+    bool show_load_path = show_flag=="1";
+
+#if defined(OS_WIN)
+    HMODULE retVal = LoadLibrary(bkndName.c_str());
+#else
+    LibHandle retVal = dlopen(bkndName.c_str(), flag);
+#endif
+    if(retVal != NULL) { // Success
+        if (show_load_path)
+            printf("Using %s from system path\n", bkndName.c_str());
+    } else {
+        /*
+         * In the event that dlopen returns NULL, search for the lib
+         * in hard coded paths based on the environment variables
+         * defined in the constant string array LIB_AF_PATHS
+         * * AF_PATH
+         * * AF_BUILD_PATH
+         *
+         * Note: This does not guarantee successful loading as the dependent
+         * libraries may still not load
+        */
+        for (int i=0; i<NUM_ENV_VARS; ++i) {
+            string abs_path = getEnvVar(LIB_AF_ENVARS[i])
+                                 + LIB_AF_RPATHS[i]
+                                 + (LIB_AF_RPATH_SUFFIX[i] ? LIB_AF_BKND_NAME[bknd_idx]+"/" : "")
+                                 + bkndName;
+#if defined(OS_WIN)
+            replace(abs_path.begin(), abs_path.end(), '/', '\\');
+            retVal = LoadLibrary(abs_path.c_str());
+#else
+            retVal = dlopen(abs_path.c_str(), flag);
+#endif
+            if (retVal!=NULL) {
+                if (show_load_path)
+                    printf("Using %s\n", abs_path.c_str());
+                // if the current absolute path based dlopen
+                // search is a success, then abandon search
+                // and proceed for compute
+                break;
+            }
+        }
+    }
+    return retVal;
+}
+
+void closeDynLibrary(LibHandle handle)
+{
+#if defined(OS_WIN)
+    FreeLibrary(handle);
+#else
+    dlclose(handle);
+#endif
+}
+
+AFSymbolManager& AFSymbolManager::getInstance()
+{
+    static AFSymbolManager symbolManager;
+    return symbolManager;
+}
+
+AFSymbolManager::AFSymbolManager()
+    : activeHandle(NULL), defaultHandle(NULL), numBackends(0), backendsAvailable(0)
+{
+    // In order of priority.
+    static const int order[] = {AF_BACKEND_CUDA,        // 1 -> Most Preferred
+                                AF_BACKEND_OPENCL,      // 4 -> Preferred if CUDA unavailable
+                                AF_BACKEND_CPU};        // 2 -> Preferred if CUDA and OpenCL unavailable
+
+    // Decremeting loop. The last successful backend loaded will be the most prefered one.
+    for(int i = NUM_BACKENDS - 1; i >= 0; i--) {
+        int backend = order[i] >> 1;    // Convert order[1, 4, 2] -> backend[0, 2, 1]
+        bkndHandles[backend] = openDynLibrary(backend);
+        if (bkndHandles[backend]) {
+            activeHandle = bkndHandles[backend];
+            activeBackend = (af_backend)order[i];
+            numBackends++;
+            backendsAvailable += order[i];
+        }
+    }
+    // Keep a copy of default order handle
+    // inorder to use it in ::setBackend when
+    // the user passes AF_BACKEND_DEFAULT
+    defaultHandle = activeHandle;
+    defaultBackend = activeBackend;
+}
+
+AFSymbolManager::~AFSymbolManager()
+{
+    for(int i=0; i<NUM_BACKENDS; ++i) {
+        if (bkndHandles[i]) {
+            closeDynLibrary(bkndHandles[i]);
+        }
+    }
+}
+
+unsigned AFSymbolManager::getBackendCount()
+{
+    return numBackends;
+}
+
+int AFSymbolManager::getAvailableBackends()
+{
+    return backendsAvailable;
+}
+
+af_err AFSymbolManager::setBackend(af::Backend bknd)
+{
+    if (bknd==AF_BACKEND_DEFAULT) {
+        if (defaultHandle) {
+            activeHandle = defaultHandle;
+            activeBackend = defaultBackend;
+            return AF_SUCCESS;
+        } else
+            return AF_ERR_LOAD_LIB;
+    }
+    int idx = bknd >> 1;    // Convert 1, 2, 4 -> 0, 1, 2
+    if(bkndHandles[idx]) {
+        activeHandle = bkndHandles[idx];
+        activeBackend = bknd;
+        return AF_SUCCESS;
+    } else {
+        return AF_ERR_LOAD_LIB;
+    }
+}
+
+bool checkArray(af_backend activeBackend, af_array a)
+{
+    // Convert af_array into int to retrieve the backend info.
+    // See ArrayInfo.hpp for more
+    af_backend backend = (af_backend)0;
+
+    // This condition is required so that the invalid args tests for unified
+    // backend return the expected error rather than AF_ERR_ARR_BKND_MISMATCH
+    // Since a = 0, does not have a backend specified, it should be a
+    // AF_ERR_ARG instead of AF_ERR_ARR_BKND_MISMATCH
+    if(a == 0) return true;
+
+    unified::AFSymbolManager::getInstance().call("af_get_backend_id", &backend, a);
+    return backend == activeBackend;
+}
+
+bool checkArrays(af_backend activeBackend)
+{
+    // Dummy
+    return true;
+}
+
+} // namespace unified
diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp
new file mode 100644
index 0000000000..f4cf913ac6
--- /dev/null
+++ b/src/api/unified/symbol_manager.hpp
@@ -0,0 +1,108 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+
+#include <af/defines.h>
+#include <string>
+#include <stdlib.h>
+#if defined(OS_WIN)
+#include <Windows.h>
+typedef HMODULE LibHandle;
+#else
+#include <dlfcn.h>
+typedef void* LibHandle;
+#endif
+
+namespace unified
+{
+
+const int NUM_BACKENDS = 3;
+const int NUM_ENV_VARS = 2;
+
+class AFSymbolManager {
+    public:
+        static AFSymbolManager& getInstance();
+
+        ~AFSymbolManager();
+
+        unsigned getBackendCount();
+
+        int getAvailableBackends();
+
+        af_err setBackend(af::Backend bnkd);
+
+        af::Backend getActiveBackend() { return activeBackend; }
+
+        template<typename... CalleeArgs>
+        af_err call(const char* symbolName, CalleeArgs... args) {
+            if (!activeHandle)
+                return AF_ERR_LOAD_LIB;
+            typedef af_err(*af_func)(CalleeArgs...);
+            af_func funcHandle;
+#if defined(OS_WIN)
+            funcHandle = (af_func)GetProcAddress(activeHandle, symbolName);
+#else
+            funcHandle = (af_func)dlsym(activeHandle, symbolName);
+#endif
+            if (!funcHandle) {
+                return AF_ERR_LOAD_SYM;
+            }
+
+            return funcHandle(args...);
+        }
+
+    protected:
+        AFSymbolManager();
+
+        // Following two declarations are required to
+        // avoid copying accidental copy/assignment
+        // of instance returned by getInstance to other
+        // variables
+        AFSymbolManager(AFSymbolManager const&);
+        void operator=(AFSymbolManager const&);
+
+    private:
+
+        LibHandle bkndHandles[NUM_BACKENDS];
+
+        LibHandle activeHandle;
+        LibHandle defaultHandle;
+        unsigned numBackends;
+        int backendsAvailable;
+        af_backend activeBackend;
+        af_backend defaultBackend;
+};
+
+// Helper functions to ensure all the input arrays are on the active backend
+bool checkArray(af_backend activeBackend, af_array a);
+bool checkArrays(af_backend activeBackend);
+
+template<typename T, typename... Args>
+bool checkArrays(af_backend activeBackend, T a, Args... arg)
+{
+    return checkArray(activeBackend, a) && checkArrays(activeBackend, arg...);
+}
+
+} // namespace unified
+
+// Macro to check af_array as inputs. The arguments to this macro should be
+// only input af_arrays. Not outputs or other types.
+#define CHECK_ARRAYS(...) do {                                                              \
+    af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend();      \
+    if(!unified::checkArrays(backendId, __VA_ARGS__))                                       \
+        return AF_ERR_ARR_BKND_MISMATCH;                                                    \
+} while(0);
+
+#if defined(OS_WIN)
+#define CALL(...) unified::AFSymbolManager::getInstance().call(__FUNCTION__, __VA_ARGS__)
+#define CALL_NO_PARAMS() unified::AFSymbolManager::getInstance().call(__FUNCTION__)
+#else
+#define CALL(...) unified::AFSymbolManager::getInstance().call(__func__, __VA_ARGS__)
+#define CALL_NO_PARAMS() unified::AFSymbolManager::getInstance().call(__func__)
+#endif
diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp
new file mode 100644
index 0000000000..155c4f81b9
--- /dev/null
+++ b/src/api/unified/util.cpp
@@ -0,0 +1,63 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/util.h>
+#include "symbol_manager.hpp"
+
+af_err af_print_array(af_array arr)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(arr);
+}
+
+af_err af_print_array_gen(const char *exp, const af_array arr, const int precision)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(exp, arr, precision);
+}
+
+af_err af_save_array(int *index, const char* key, const af_array arr, const char *filename, const bool append)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(index, key, arr, filename, append);
+}
+
+af_err af_read_array_index(af_array *out, const char *filename, const unsigned index)
+{
+    return CALL(out, filename, index);
+}
+
+af_err af_read_array_key(af_array *out, const char *filename, const char* key)
+{
+    return CALL(out, filename, key);
+}
+
+af_err af_read_array_key_check(int *index, const char *filename, const char* key)
+{
+    return CALL(index, filename, key);
+}
+
+af_err af_array_to_string(char **output, const char *exp, const af_array arr,
+        const int precision, const bool transpose)
+{
+    CHECK_ARRAYS(arr);
+    return CALL(output, exp, arr, precision, transpose);
+}
+
+af_err af_example_function(af_array* out, const af_array in, const af_someenum_t param)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, param);
+}
+
+af_err af_get_version(int *major, int *minor, int *patch)
+{
+    return CALL(major, minor, patch);
+}
diff --git a/src/api/unified/vision.cpp b/src/api/unified/vision.cpp
new file mode 100644
index 0000000000..cc43a61ed4
--- /dev/null
+++ b/src/api/unified/vision.cpp
@@ -0,0 +1,86 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/array.h>
+#include <af/vision.h>
+#include "symbol_manager.hpp"
+
+af_err af_fast(af_features *out, const af_array in, const float thr, const unsigned arc_length, const bool non_max, const float feature_ratio, const unsigned edge)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, thr, arc_length, non_max, feature_ratio, edge);
+}
+
+af_err af_harris(af_features *out, const af_array in, const unsigned max_corners, const float min_response, const float sigma, const unsigned block_size, const float k_thr)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, max_corners, min_response, sigma, block_size, k_thr);
+}
+
+af_err af_orb(af_features *feat, af_array *desc, const af_array in, const float fast_thr, const unsigned max_feat, const float scl_fctr, const unsigned levels, const bool blur_img)
+{
+    CHECK_ARRAYS(in);
+    return CALL(feat, desc, in, fast_thr, max_feat, scl_fctr, levels, blur_img);
+}
+
+af_err af_sift(af_features *feat, af_array *desc, const af_array in, const unsigned n_layers, const float contrast_thr, const float edge_thr, const float init_sigma, const bool double_input, const float intensity_scale, const float feature_ratio)
+{
+    CHECK_ARRAYS(in);
+    return CALL(feat, desc, in, n_layers, contrast_thr, edge_thr, init_sigma, double_input, intensity_scale, feature_ratio);
+}
+
+af_err af_gloh(af_features *feat, af_array *desc, const af_array in, const unsigned n_layers, const float contrast_thr, const float edge_thr, const float init_sigma, const bool double_input, const float intensity_scale, const float feature_ratio)
+{
+    CHECK_ARRAYS(in);
+    return CALL(feat, desc, in, n_layers, contrast_thr, edge_thr, init_sigma, double_input, intensity_scale, feature_ratio);
+}
+
+af_err af_hamming_matcher(af_array* idx, af_array* dist,
+        const af_array query, const af_array train,
+        const dim_t dist_dim, const unsigned n_dist)
+{
+    CHECK_ARRAYS(query, train);
+    return CALL(idx, dist, query, train, dist_dim, n_dist);
+}
+
+af_err af_nearest_neighbour(af_array* idx, af_array* dist,
+        const af_array query, const af_array train,
+        const dim_t dist_dim, const unsigned n_dist,
+        const af_match_type dist_type)
+{
+    CHECK_ARRAYS(query, train);
+    return CALL(idx, dist, query, train, dist_dim, n_dist, dist_type);
+}
+
+af_err af_match_template(af_array *out, const af_array search_img, const af_array template_img, const af_match_type m_type)
+{
+    CHECK_ARRAYS(search_img, template_img);
+    return CALL(out, search_img, template_img, m_type);
+}
+
+af_err af_susan(af_features* out, const af_array in, const unsigned radius, const float diff_thr, const float geom_thr,
+        const float feature_ratio, const unsigned edge)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, radius, diff_thr, geom_thr, feature_ratio, edge);
+}
+
+af_err af_dog(af_array *out, const af_array in, const int radius1, const int radius2)
+{
+    CHECK_ARRAYS(in);
+    return CALL(out, in, radius1, radius2);
+}
+
+af_err af_homography(af_array *H, int *inliers, const af_array x_src, const af_array y_src,
+                     const af_array x_dst, const af_array y_dst, const af_homography_type htype,
+                     const float inlier_thr, const unsigned iterations, const af_dtype type)
+{
+    CHECK_ARRAYS(x_src, y_src, x_dst, y_dst);
+    return CALL(H, inliers, x_src, y_src, x_dst, y_dst, htype, inlier_thr, iterations, type);
+}
diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp
index 20c5bd88e2..219bc1991c 100644
--- a/src/backend/ArrayInfo.cpp
+++ b/src/backend/ArrayInfo.cpp
@@ -13,6 +13,9 @@
 #include <functional>
 #include <err_common.hpp>
 
+#include <backend.hpp>
+#include <platform.hpp>
+
 using af::dim4;
 
 dim_t
@@ -57,6 +60,40 @@ dim4 calcStrides(const dim4 &parentDim)
     return out;
 }
 
+int ArrayInfo::getDevId() const
+{
+    // The actual device ID is only stored in the first 4 bits of devId
+    // See ArrayInfo.hpp for more
+    return devId & 0xf;
+}
+
+void ArrayInfo::setId(int id) const
+{
+    // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1
+    // for CPU, CUDA and OpenCL respectively
+    // See ArrayInfo.hpp for more
+    int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2
+    const_cast<ArrayInfo *>(this)->setId(id | 1 << (backendId + 3));
+}
+
+void ArrayInfo::setId(int id)
+{
+    // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1
+    // for CPU, CUDA and OpenCL respectively
+    // See ArrayInfo.hpp for more
+    int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2
+    devId = id | 1 << (backendId + 3);
+}
+
+af_backend ArrayInfo::getBackendId() const
+{
+    // devId >> 3 converts the backend info to 1, 2, 4 which are enums
+    // for CPU, CUDA and OpenCL respectively
+    // See ArrayInfo.hpp for more
+    int backendId = devId >> 3;
+    return (af_backend)backendId;
+}
+
 void ArrayInfo::modStrides(const dim4 &newStrides)
 {
     dim_strides = newStrides;
@@ -133,6 +170,8 @@ bool ArrayInfo::isInteger() const
          || type == u32
          || type == s64
          || type == u64
+         || type == s16
+         || type == u16
          || type == u8);
 }
 
@@ -172,3 +211,46 @@ dim4 getOutDims(const dim4 &ldims, const dim4 &rdims, bool batchMode)
 
     return dim4(4, odims);
 }
+
+using std::vector;
+
+dim4
+toDims(const vector<af_seq>& seqs, const dim4 &parentDims)
+{
+    dim4 outDims(1, 1, 1, 1);
+    for(unsigned i = 0; i < seqs.size(); i++ ) {
+        outDims[i] = af::calcDim(seqs[i], parentDims[i]);
+        if (outDims[i] > parentDims[i])
+            AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
+    }
+    return outDims;
+}
+
+dim4
+toOffset(const vector<af_seq>& seqs, const dim4 &parentDims)
+{
+    dim4 outOffsets(0, 0, 0, 0);
+    for(unsigned i = 0; i < seqs.size(); i++ ) {
+        if (seqs[i].step !=0 && seqs[i].begin >= 0) {
+            outOffsets[i] = seqs[i].begin;
+        } else if (seqs[i].begin <= -1) {
+            outOffsets[i] = parentDims[i] + seqs[i].begin;
+        } else {
+            outOffsets[i] = 0;
+        }
+
+        if (outOffsets[i] >= parentDims[i])
+            AF_ERROR("Index out of range", AF_ERR_SIZE);
+    }
+    return outOffsets;
+}
+
+dim4
+toStride(const vector<af_seq>& seqs, const af::dim4 &parentDims)
+{
+    dim4 out(calcStrides(parentDims));
+    for(unsigned i = 0; i < seqs.size(); i++ ) {
+        if  (seqs[i].step != 0) {   out[i] *= seqs[i].step; }
+    }
+    return out;
+}
diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp
index f6d2663eba..ca6fcd394c 100644
--- a/src/backend/ArrayInfo.hpp
+++ b/src/backend/ArrayInfo.hpp
@@ -14,6 +14,7 @@
 #include <af/dim4.hpp>
 #include <af/device.h>
 #include <vector>
+#include <cstddef>
 
 dim_t
 calcOffset(const af::dim4 &strides, const af::dim4 &offsets);
@@ -30,6 +31,20 @@ af::dim4 getOutDims(const af::dim4 &ldims, const af::dim4 &rdims, bool batchMode
 class ArrayInfo
 {
 private:
+    // The devId variable stores information about the deviceId as well as the backend.
+    // The 4 LSBs (0-3) are used to store the device ID.
+    // The 4th LSB is set to 1 if backend is CPU
+    // The 5th LSB is set to 1 if backend is CUDA
+    // The 6th LSB is set to 1 if backend is OpenCL
+    // This information can be retrieved directly from an af_array by doing
+    //     int* devId = reinterpret_cast<int*>(a); // a is an af_array
+    //     af_backend backendID = *devId >> 3;  // Returns 1, 2, 4 for CPU, CUDA or OpenCL respectively
+    //     int        deviceID  = *devId & 0xf; // Returns devices ID between 0-15
+    // This is possible by doing a static_assert on devId
+    //
+    // This can be changed in the future if the need arises for more devices as this
+    // implementation is internal. Make sure to change the bit shift ops when
+    // such a change is being made
     int             devId;
     af_dtype        type;
     af::dim4        dim_size;
@@ -42,7 +57,16 @@ class ArrayInfo
         dim_size(size),
         dim_offsets(offset),
         dim_strides(stride)
-    { af_init(); }
+    {
+        af_init();
+        setId(id);
+#if __cplusplus > 199711l
+    static_assert(offsetof(ArrayInfo, devId) == 0,
+                  "ArrayInfo::devId must be the first member variable of ArrayInfo. \
+                   devId is used to encode the backend into the integer. \
+                   This is then used in the unified backend to check mismatched arrays.");
+#endif
+    }
 
 #if __cplusplus > 199711L
     //Copy constructors are deprecated if there is a
@@ -55,16 +79,19 @@ class ArrayInfo
 
     const af::dim4& offsets() const     { return dim_offsets;           }
 
-    const af::dim4& strides()    const  { return dim_strides;           }
+    const af::dim4& strides() const     { return dim_strides;           }
 
     size_t elements() const             { return dim_size.elements();   }
     size_t ndims() const                { return dim_size.ndims();      }
     const af::dim4& dims() const        { return dim_size;              }
 
-    int getDevId() const { return devId; }
+    int getDevId() const;
+
+    void setId(int id) const;
+
+    void setId(int id);
 
-    void setId(int id) const { const_cast<ArrayInfo *>(this)->setId(id); }
-    void setId(int id) { devId = id; }
+    af_backend getBackendId() const;
 
     void resetInfo(const af::dim4& dims)
     {
diff --git a/src/backend/cblas.cpp b/src/backend/cblas.cpp
index bea65b963f..4d99d457c2 100644
--- a/src/backend/cblas.cpp
+++ b/src/backend/cblas.cpp
@@ -35,6 +35,7 @@
 #endif
 
 #define ADD_
+#include <cblas.h>
 #include <cblas_f77.h>
 
 static char transChar(CBLAS_TRANSPOSE Trans)
diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp
index 683fc1ad59..5321137cd5 100644
--- a/src/backend/cpu/Array.cpp
+++ b/src/backend/cpu/Array.cpp
@@ -49,7 +49,7 @@ namespace cpu
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
-        info(-1, dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(), data_dims(dims),
         node(n), offset(0), ready(false), owner(true)
     {
@@ -293,4 +293,6 @@ namespace cpu
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp
index 69b943a6e5..2d3beae942 100644
--- a/src/backend/cpu/approx.cpp
+++ b/src/backend/cpu/approx.cpp
@@ -25,7 +25,8 @@ namespace cpu
                   const Ty *in,  const af::dim4 &idims, const dim_t iElems,
                   const Tp *pos, const af::dim4 &pdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const dim_t idx)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
             return;
         }
@@ -38,32 +39,28 @@ namespace cpu
                   const Ty *in,  const af::dim4 &idims, const dim_t iElems,
                   const Tp *pos, const af::dim4 &pdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const dim_t idx)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
-            const dim_t pmId = idx;
+            dim_t pmId = idx;
+            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
 
             const Tp x = pos[pmId];
             bool gFlag = false;
-            if (x < 0 || idims[0] < x+1) {
+            if (x < 0 || idims[0] < x+1) {  // No need to check y
                 gFlag = true;
             }
 
-            for(dim_t idw = 0; idw < odims[3]; idw++) {
-                for(dim_t idz = 0; idz < odims[2]; idz++) {
-                    for(dim_t idy = 0; idy < odims[1]; idy++) {
-                        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                                            + idy * ostrides[1] + idx;
-                        if(gFlag) {
-                            out[omId] = scalar<Ty>(offGrid);
-                        } else {
-                            dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                                          + idy * istrides[1];
-                            const dim_t iMem = round(x) + ioff;
-
-                            out[omId] = in[iMem];
-                        }
-                    }
-                }
+            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                             + idy * ostrides[1] + idx;
+            if(gFlag) {
+                out[omId] = scalar<Ty>(offGrid);
+            } else {
+                dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                           + idy * istrides[1];
+                const dim_t iMem = round(x) + ioff;
+
+                out[omId] = in[iMem];
             }
         }
     };
@@ -75,9 +72,11 @@ namespace cpu
                   const Ty *in,  const af::dim4 &idims, const dim_t iElems,
                   const Tp *pos, const af::dim4 &pdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const dim_t idx)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
-            const dim_t pmId = idx;
+            dim_t pmId = idx;
+            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
 
             const Tp x = pos[pmId];
             bool gFlag = false;
@@ -85,32 +84,26 @@ namespace cpu
                 gFlag = true;
             }
 
-            const int grid_x = floor(x);  // nearest grid
+            const dim_t grid_x = floor(x);  // nearest grid
             const Tp off_x = x - grid_x; // fractional offset
 
-            for(dim_t idw = 0; idw < odims[3]; idw++) {
-                for(dim_t idz = 0; idz < odims[2]; idz++) {
-                    for(dim_t idy = 0; idy < odims[1]; idy++) {
-                        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                                            + idy * ostrides[1] + idx;
-                        if(gFlag) {
-                            out[omId] = scalar<Ty>(offGrid);
-                        } else {
-                            dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
-
-                            // Check if x and x + 1 are both valid indices
-                            bool cond = (x < idims[0] - 1);
-                            // Compute Left and Right Weighted Values
-                            Ty yl = ((Tp)1.0 - off_x) * in[ioff];
-                            Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
-                            Ty yo = yl + yr;
-                            // Compute Weight used
-                            Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
-                            // Write final value
-                            out[omId] = (yo / wt);
-                        }
-                    }
-                }
+            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                             + idy * ostrides[1] + idx;
+            if(gFlag) {
+                out[omId] = scalar<Ty>(offGrid);
+            } else {
+                dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
+
+                // Check if x and x + 1 are both valid indices
+                bool cond = (x < idims[0] - 1);
+                // Compute Left and Right Weighted Values
+                Ty yl = ((Tp)1.0 - off_x) * in[ioff];
+                Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
+                Ty yo = yl + yr;
+                // Compute Weight used
+                Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
+                // Write final value
+                out[omId] = (yo / wt);
             }
         }
     };
@@ -123,9 +116,17 @@ namespace cpu
             const float offGrid)
     {
         approx1_op<Ty, Tp, method> op;
-        for(dim_t x = 0; x < odims[0]; x++) {
-            op(out, odims, oElems, in, idims, iElems, pos, pdims,
-               ostrides, istrides, pstrides, offGrid, x);
+        bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
+
+        for(dim_t w = 0; w < odims[3]; w++) {
+            for(dim_t z = 0; z < odims[2]; z++) {
+                for(dim_t y = 0; y < odims[1]; y++) {
+                    for(dim_t x = 0; x < odims[0]; x++) {
+                        op(out, odims, oElems, in, idims, iElems, pos, pdims,
+                           ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
+                    }
+                }
+            }
         }
     }
 
@@ -169,7 +170,8 @@ namespace cpu
                   const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides,
                   const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const dim_t idx, const dim_t idy)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
             return;
         }
@@ -183,10 +185,15 @@ namespace cpu
                   const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides,
                   const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const dim_t idx, const dim_t idy)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
-            const dim_t pmId = idy * pstrides[1] + idx;
-            const dim_t qmId = idy * qstrides[1] + idx;
+            dim_t pmId = idy * pstrides[1] + idx;
+            dim_t qmId = idy * qstrides[1] + idx;
+            if(pBatch) {
+                pmId += idw * pstrides[3] + idz * pstrides[2];
+                qmId += idw * qstrides[3] + idz * qstrides[2];
+            }
 
             bool gFlag = false;
             const Tp x = pos[pmId], y = qos[qmId];
@@ -194,20 +201,15 @@ namespace cpu
                 gFlag = true;
             }
 
-            for(dim_t idw = 0; idw < odims[3]; idw++) {
-                for(dim_t idz = 0; idz < odims[2]; idz++) {
-                    const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                                        + idy * ostrides[1] + idx;
-                    if(gFlag) {
-                        out[omId] = scalar<Ty>(offGrid);
-                    } else {
-                        const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
-                        const dim_t imId = idw * istrides[3] +
-                                              idz * istrides[2] +
-                                              grid_y * istrides[1] + grid_x;
-                        out[omId] = in[imId];
-                    }
-                }
+            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                             + idy * ostrides[1] + idx;
+            if(gFlag) {
+                out[omId] = scalar<Ty>(offGrid);
+            } else {
+                const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
+                const dim_t imId = idw * istrides[3] + idz * istrides[2] +
+                                grid_y * istrides[1] + grid_x;
+                out[omId] = in[imId];
             }
         }
     };
@@ -220,10 +222,15 @@ namespace cpu
                   const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
                   const af::dim4 &ostrides, const af::dim4 &istrides,
                   const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const dim_t idx, const dim_t idy)
+                  const float offGrid, const bool pBatch,
+                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
         {
-            const dim_t pmId = idy * pstrides[1] + idx;
-            const dim_t qmId = idy * qstrides[1] + idx;
+            dim_t pmId = idy * pstrides[1] + idx;
+            dim_t qmId = idy * qstrides[1] + idx;
+            if(pBatch) {
+                pmId += idw * pstrides[3] + idz * pstrides[2];
+                qmId += idw * qstrides[3] + idz * qstrides[2];
+            }
 
             bool gFlag = false;
             const Tp x = pos[pmId], y = qos[qmId];
@@ -231,7 +238,7 @@ namespace cpu
                 gFlag = true;
             }
 
-            const int grid_x = floor(x),   grid_y = floor(y);   // nearest grid
+            const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
             const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
 
             // Check if pVal and pVal + 1 are both valid indices
@@ -247,29 +254,24 @@ namespace cpu
             Tp wt = wt00 + wt10 + wt01 + wt11;
             Ty zero = scalar<Ty>(0);
 
-            for(dim_t idw = 0; idw < odims[3]; idw++) {
-                for(dim_t idz = 0; idz < odims[2]; idz++) {
-                    const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                                        + idy * ostrides[1] + idx;
-                    if(gFlag) {
-                        out[omId] = scalar<Ty>(offGrid);
-                    } else {
-                        dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                                   + grid_y * istrides[1] + grid_x;
-
-                        // Compute Weighted Values
-                        Ty y00 =                    wt00 * in[ioff];
-                        Ty y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
-                        Ty y01 = (condX) ?          wt01 * in[ioff + 1]                   : zero;
-                        Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
+            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                             + idy * ostrides[1] + idx;
+            if(gFlag) {
+                out[omId] = scalar<Ty>(offGrid);
+            } else {
+                dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                        + grid_y * istrides[1] + grid_x;
 
-                        Ty yo = y00 + y10 + y01 + y11;
+                // Compute Weighted Values
+                Ty y00 =                    wt00 * in[ioff];
+                Ty y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
+                Ty y01 = (condX) ?          wt01 * in[ioff + 1]               : zero;
+                Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
 
-                        // Write Final Value
-                        out[omId] = (yo / wt);
+                Ty yo = y00 + y10 + y01 + y11;
 
-                    }
-                }
+                // Write Final Value
+                out[omId] = (yo / wt);
             }
         }
     };
@@ -283,10 +285,16 @@ namespace cpu
             const float offGrid)
     {
         approx2_op<Ty, Tp, method> op;
-        for(dim_t y = 0; y < odims[1]; y++) {
-            for(dim_t x = 0; x < odims[0]; x++) {
-                op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
-                    ostrides, istrides, pstrides, qstrides, offGrid, x, y);
+        bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
+
+        for(dim_t w = 0; w < odims[3]; w++) {
+            for(dim_t z = 0; z < odims[2]; z++) {
+                for(dim_t y = 0; y < odims[1]; y++) {
+                    for(dim_t x = 0; x < odims[0]; x++) {
+                        op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
+                           ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
+                    }
+                }
             }
         }
     }
@@ -325,12 +333,12 @@ namespace cpu
         return out;
     }
 
-#define INSTANTIATE(Ty, Tp)                                                                     \
+#define INSTANTIATE(Ty, Tp)                                                                    \
     template Array<Ty> approx1<Ty, Tp>(const Array<Ty> &in, const Array<Tp> &pos,              \
-                                        const af_interp_type method, const float offGrid);      \
+                                       const af_interp_type method, const float offGrid);      \
     template Array<Ty> approx2<Ty, Tp>(const Array<Ty> &in, const Array<Tp> &pos0,             \
-                                        const Array<Tp> &pos1, const af_interp_type method,     \
-                                        const float offGrid);                                   \
+                                       const Array<Tp> &pos1, const af_interp_type method,     \
+                                       const float offGrid);                                   \
 
     INSTANTIATE(float  , float )
     INSTANTIATE(double , double)
diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp
index a8ac33ece0..623bd52ac7 100644
--- a/src/backend/cpu/assign.cpp
+++ b/src/backend/cpu/assign.cpp
@@ -124,5 +124,7 @@ INSTANTIATE(intl   )
 INSTANTIATE(int    )
 INSTANTIATE(uchar  )
 INSTANTIATE(char   )
+INSTANTIATE(ushort )
+INSTANTIATE(short  )
 
 }
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp
index d8ef7c61cb..2d1e4dddff 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/bilateral.cpp
@@ -107,5 +107,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp
index 33670d47cc..77d7daa5cd 100644
--- a/src/backend/cpu/convolve.cpp
+++ b/src/backend/cpu/convolve.cpp
@@ -319,5 +319,9 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp
index a2bb4ff912..87e4480a36 100644
--- a/src/backend/cpu/copy.cpp
+++ b/src/backend/cpu/copy.cpp
@@ -149,6 +149,8 @@ namespace cpu
     INSTANTIATE(char   )
     INSTANTIATE(intl   )
     INSTANTIATE(uintl  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 
 
 #define INSTANTIATE_PAD_ARRAY(SRC_T)                                    \
@@ -158,29 +160,35 @@ namespace cpu
     template Array<cdouble> padArray<SRC_T, cdouble>(Array<SRC_T> const &src, dim4 const &dims, cdouble default_value, double factor); \
     template Array<int    > padArray<SRC_T, int    >(Array<SRC_T> const &src, dim4 const &dims, int     default_value, double factor); \
     template Array<uint   > padArray<SRC_T, uint   >(Array<SRC_T> const &src, dim4 const &dims, uint    default_value, double factor); \
-    template Array<intl    > padArray<SRC_T, intl    >(Array<SRC_T> const &src, dim4 const &dims, intl     default_value, double factor); \
-    template Array<uintl   > padArray<SRC_T, uintl   >(Array<SRC_T> const &src, dim4 const &dims, uintl    default_value, double factor); \
+    template Array<intl   > padArray<SRC_T, intl   >(Array<SRC_T> const &src, dim4 const &dims, intl    default_value, double factor); \
+    template Array<uintl  > padArray<SRC_T, uintl  >(Array<SRC_T> const &src, dim4 const &dims, uintl   default_value, double factor); \
+    template Array<short  > padArray<SRC_T, short  >(Array<SRC_T> const &src, dim4 const &dims, short   default_value, double factor); \
+    template Array<ushort > padArray<SRC_T, ushort >(Array<SRC_T> const &src, dim4 const &dims, ushort  default_value, double factor); \
     template Array<uchar  > padArray<SRC_T, uchar  >(Array<SRC_T> const &src, dim4 const &dims, uchar   default_value, double factor); \
     template Array<char   > padArray<SRC_T, char   >(Array<SRC_T> const &src, dim4 const &dims, char    default_value, double factor); \
-    template void copyArray<SRC_T, float  >(Array<float  > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, double >(Array<double > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, cfloat >(Array<cfloat > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, cdouble>(Array<cdouble> &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, int    >(Array<int    > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, uint   >(Array<uint   > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, intl    >(Array<intl    > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, uintl   >(Array<uintl   > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, float  >(Array<float  > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, double >(Array<double > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, cfloat >(Array<cfloat > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, cdouble>(Array<cdouble> &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, int    >(Array<int    > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, uint   >(Array<uint   > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, intl   >(Array<intl   > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, uintl  >(Array<uintl  > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, short  >(Array<short  > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, ushort >(Array<ushort > &dst, Array<SRC_T> const &src);  \
+    template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src);  \
     template void copyArray<SRC_T, char   >(Array<char   > &dst, Array<SRC_T> const &src);
 
     INSTANTIATE_PAD_ARRAY(float )
     INSTANTIATE_PAD_ARRAY(double)
     INSTANTIATE_PAD_ARRAY(int   )
     INSTANTIATE_PAD_ARRAY(uint  )
-    INSTANTIATE_PAD_ARRAY(intl   )
-    INSTANTIATE_PAD_ARRAY(uintl  )
+    INSTANTIATE_PAD_ARRAY(intl  )
+    INSTANTIATE_PAD_ARRAY(uintl )
     INSTANTIATE_PAD_ARRAY(uchar )
     INSTANTIATE_PAD_ARRAY(char  )
+    INSTANTIATE_PAD_ARRAY(ushort)
+    INSTANTIATE_PAD_ARRAY(short )
 
 #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T)                            \
     template Array<cfloat > padArray<SRC_T, cfloat >(Array<SRC_T> const &src, dim4 const &dims, cfloat  default_value, double factor); \
@@ -197,14 +205,16 @@ namespace cpu
         CPU_NOT_SUPPORTED();\
     }
 
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, double)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, float)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, uchar)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, char)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, uint)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, int)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, intl)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
@@ -213,5 +223,7 @@ namespace cpu
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
 
 }
diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp
index 2ae69a6901..d949a24437 100644
--- a/src/backend/cpu/diagonal.cpp
+++ b/src/backend/cpu/diagonal.cpp
@@ -86,5 +86,7 @@ namespace cpu
     INSTANTIATE_DIAGONAL(uintl)
     INSTANTIATE_DIAGONAL(char)
     INSTANTIATE_DIAGONAL(uchar)
+    INSTANTIATE_DIAGONAL(short)
+    INSTANTIATE_DIAGONAL(ushort)
 
 }
diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp
index 907c111c0b..063a761baf 100644
--- a/src/backend/cpu/diff.cpp
+++ b/src/backend/cpu/diff.cpp
@@ -120,4 +120,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(ushort)
+    INSTANTIATE(short)
 }
diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp
index 929d48fcc2..1c8069c24d 100644
--- a/src/backend/cpu/fast.cpp
+++ b/src/backend/cpu/fast.cpp
@@ -336,5 +336,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp
index bdc5538245..f76f3a0d3f 100644
--- a/src/backend/cpu/fftconvolve.cpp
+++ b/src/backend/cpu/fftconvolve.cpp
@@ -428,5 +428,9 @@ INSTANTIATE(uint  , float,  cfloat,  false, true)
 INSTANTIATE(int   , float,  cfloat,  false, true)
 INSTANTIATE(uchar , float,  cfloat,  false, true)
 INSTANTIATE(char  , float,  cfloat,  false, true)
+INSTANTIATE(uintl , float,  cfloat,  false, true)
+INSTANTIATE(intl  , float,  cfloat,  false, true)
+INSTANTIATE(ushort, float,  cfloat,  false, true)
+INSTANTIATE(short , float,  cfloat,  false, true)
 
 } // namespace cpu
diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp
index 4c940fb523..21d3fdf941 100644
--- a/src/backend/cpu/hist_graphics.cpp
+++ b/src/backend/cpu/hist_graphics.cpp
@@ -34,6 +34,8 @@ INSTANTIATE(float)
 INSTANTIATE(int)
 INSTANTIATE(uint)
 INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
 
diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp
index 371a8d0407..e382a0ee87 100644
--- a/src/backend/cpu/histogram.cpp
+++ b/src/backend/cpu/histogram.cpp
@@ -60,5 +60,9 @@ INSTANTIATE(char  , uint)
 INSTANTIATE(int   , uint)
 INSTANTIATE(uint  , uint)
 INSTANTIATE(uchar , uint)
+INSTANTIATE(short , uint)
+INSTANTIATE(ushort, uint)
+INSTANTIATE(intl  , uint)
+INSTANTIATE(uintl , uint)
 
 }
diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp
new file mode 100644
index 0000000000..d20f0ca00c
--- /dev/null
+++ b/src/backend/cpu/homography.cpp
@@ -0,0 +1,383 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+#include <handle.hpp>
+#include <homography.hpp>
+#include <arith.hpp>
+#include <ireduce.hpp>
+#include <random.hpp>
+#include <svd.hpp>
+#include <memory.hpp>
+#include <cstring>
+
+#include <cfloat>
+
+using af::dim4;
+
+namespace cpu
+{
+
+template<typename T>
+T sq(T a)
+{
+    return a * a;
+}
+
+#define APTR(Y, X) (A_ptr[(Y) * Adims[0] + (X)])
+
+static const float RANSACConfidence = 0.99f;
+static const float LMEDSConfidence = 0.99f;
+static const float LMEDSOutlierRatio = 0.4f;
+
+template<typename T>
+struct EPS
+{
+    T eps() { return FLT_EPSILON; }
+};
+
+template<>
+struct EPS<float>
+{
+    static float eps() { return FLT_EPSILON; }
+};
+
+template<>
+struct EPS<double>
+{
+    static double eps() { return DBL_EPSILON; }
+};
+
+template<typename T>
+void JacobiSVD(T* S, T* V, int m, int n)
+{
+    const int iterations = 30;
+    T* d = new T[n];
+
+    for (int i = 0; i < n; i++) {
+        T sd = 0;
+        for (int j = 0; j < m; j++) {
+            T t = S[i*m + j];
+            sd += t*t;
+        }
+        d[i] = sd;
+
+        V[i*n + i] = 1;
+    }
+
+    for (int it = 0; it < iterations; it++) {
+        bool converged = false;
+
+        for (int i = 0; i < n-1; i++) {
+            for (int j = i+1; j < n; j++) {
+                T* Si = S + i*m;
+                T* Sj = S + j*m;
+                T* Vi = V + i*n;
+                T* Vj = V + j*n;
+
+                T p = (T)0;
+                for (int k = 0; k < m; k++)
+                    p += Si[k]*Sj[k];
+
+                if (std::abs(p) <= m*EPS<T>::eps()*std::sqrt(d[i]*d[j]))
+                    continue;
+
+                T y = d[i] - d[j];
+                T r = hypot(p*2, y);
+                T r2 = r*2;
+                T c, s;
+                if (y >= 0) {
+                    c = std::sqrt((r + y) / r2);
+                    s = p / (r2*c);
+                }
+                else {
+                    s = std::sqrt((r - y) / r2);
+                    c = p / (r2*s);
+                }
+
+                T a = 0, b = 0;
+                for (int k = 0; k < m; k++) {
+                    T t0 = c*Si[k] + s*Sj[k];
+                    T t1 = c*Sj[k] - s*Si[k];
+                    Si[k] = t0;
+                    Sj[k] = t1;
+
+                    a += t0*t0;
+                    b += t1*t1;
+                }
+                d[i] = a;
+                d[j] = b;
+
+                for (int l = 0; l < n; l++) {
+                    T t0 = Vi[l] * c + Vj[l] * s;
+                    T t1 = Vj[l] * c - Vi[l] * s;
+
+                    Vi[l] = t0;
+                    Vj[l] = t1;
+                }
+
+                converged = true;
+            }
+            if (!converged)
+                break;
+        }
+    }
+
+    delete[] d;
+}
+
+unsigned updateIterations(float inlier_ratio, unsigned iter)
+{
+    float w = std::min(std::max(inlier_ratio, 0.0f), 1.0f);
+    float wn = pow(1 - w, 4.f);
+
+    float d = 1.f - wn;
+    if (d < FLT_MIN)
+        return 0;
+
+    d = log(d);
+
+    float p = std::min(std::max(RANSACConfidence, 0.0f), 1.0f);
+    float n = log(1.f - p);
+
+    return n <= d*iter ? iter : (unsigned)round(n/d);
+}
+
+template<typename T>
+int computeHomography(T* H_ptr,
+                      const float* rnd_ptr,
+                      const float* x_src_ptr,
+                      const float* y_src_ptr,
+                      const float* x_dst_ptr,
+                      const float* y_dst_ptr)
+{
+    if ((unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[1] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[2] ||
+        (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[3] || (unsigned)rnd_ptr[1] == (unsigned)rnd_ptr[2] ||
+        (unsigned)rnd_ptr[1] == (unsigned)rnd_ptr[3] || (unsigned)rnd_ptr[2] == (unsigned)rnd_ptr[3])
+        return 1;
+
+    float src_pt_x[4], src_pt_y[4], dst_pt_x[4], dst_pt_y[4];
+    for (unsigned j = 0; j < 4; j++) {
+        src_pt_x[j] = x_src_ptr[(unsigned)rnd_ptr[j]];
+        src_pt_y[j] = y_src_ptr[(unsigned)rnd_ptr[j]];
+        dst_pt_x[j] = x_dst_ptr[(unsigned)rnd_ptr[j]];
+        dst_pt_y[j] = y_dst_ptr[(unsigned)rnd_ptr[j]];
+    }
+
+    float x_src_mean = (src_pt_x[0] + src_pt_x[1] + src_pt_x[2] + src_pt_x[3]) / 4.f;
+    float y_src_mean = (src_pt_y[0] + src_pt_y[1] + src_pt_y[2] + src_pt_y[3]) / 4.f;
+    float x_dst_mean = (dst_pt_x[0] + dst_pt_x[1] + dst_pt_x[2] + dst_pt_x[3]) / 4.f;
+    float y_dst_mean = (dst_pt_y[0] + dst_pt_y[1] + dst_pt_y[2] + dst_pt_y[3]) / 4.f;
+
+    float src_var = 0.0f, dst_var = 0.0f;
+    for (unsigned j = 0; j < 4; j++) {
+        src_var += sq(src_pt_x[j] - x_src_mean) + sq(src_pt_y[j] - y_src_mean);
+        dst_var += sq(dst_pt_x[j] - x_dst_mean) + sq(dst_pt_y[j] - y_dst_mean);
+    }
+
+    src_var /= 4.f;
+    dst_var /= 4.f;
+
+    float src_scale = sqrt(2.0f) / sqrt(src_var);
+    float dst_scale = sqrt(2.0f) / sqrt(dst_var);
+
+    Array<T> A = createValueArray<T>(af::dim4(9, 9), (T)0);
+    af::dim4 Adims = A.dims();
+    T* A_ptr = A.get();
+
+    for (unsigned j = 0; j < 4; j++) {
+        float srcx = (src_pt_x[j] - x_src_mean) * src_scale;
+        float srcy = (src_pt_y[j] - y_src_mean) * src_scale;
+        float dstx = (dst_pt_x[j] - x_dst_mean) * dst_scale;
+        float dsty = (dst_pt_y[j] - y_dst_mean) * dst_scale;
+
+        APTR(3, j*2) = -srcx;
+        APTR(4, j*2) = -srcy;
+        APTR(5, j*2) = -1.0f;
+        APTR(6, j*2) = dsty*srcx;
+        APTR(7, j*2) = dsty*srcy;
+        APTR(8, j*2) = dsty;
+
+        APTR(0, j*2+1) = srcx;
+        APTR(1, j*2+1) = srcy;
+        APTR(2, j*2+1) = 1.0f;
+        APTR(6, j*2+1) = -dstx*srcx;
+        APTR(7, j*2+1) = -dstx*srcy;
+        APTR(8, j*2+1) = -dstx;
+    }
+
+    Array<T> V = createValueArray<T>(af::dim4(Adims[1], Adims[1]), (T)0);
+    JacobiSVD<T>(A.get(), V.get(), 9, 9);
+
+    af::dim4 Vdims = V.dims();
+    T* V_ptr = V.get();
+
+    std::vector<T> vH;
+    for (unsigned j = 0; j < 9; j++)
+        vH.push_back(V_ptr[8 * Vdims[0] + j]);
+
+    H_ptr[0] = src_scale*x_dst_mean*vH[6] + src_scale*vH[0]/dst_scale;
+    H_ptr[1] = src_scale*x_dst_mean*vH[7] + src_scale*vH[1]/dst_scale;
+    H_ptr[2] = x_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                          (vH[2] - src_scale*y_src_mean*vH[1] - src_scale*x_src_mean*vH[0])/dst_scale;
+
+    H_ptr[3] = src_scale*y_dst_mean*vH[6] + src_scale*vH[3]/dst_scale;
+    H_ptr[4] = src_scale*y_dst_mean*vH[7] + src_scale*vH[4]/dst_scale;
+    H_ptr[5] = y_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                          (vH[5] - src_scale*y_src_mean*vH[4] - src_scale*x_src_mean*vH[3])/dst_scale;
+
+    H_ptr[6] = src_scale*vH[6];
+    H_ptr[7] = src_scale*vH[7];
+    H_ptr[8] = vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6];
+
+    return 0;
+}
+
+// LMedS: http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
+template<typename T>
+int findBestHomography(Array<T> &bestH,
+                       const Array<float> &x_src,
+                       const Array<float> &y_src,
+                       const Array<float> &x_dst,
+                       const Array<float> &y_dst,
+                       const Array<float> &rnd,
+                       const unsigned iterations,
+                       const unsigned nsamples,
+                       const float inlier_thr,
+                       const af_homography_type htype)
+{
+    const float* x_src_ptr = x_src.get();
+    const float* y_src_ptr = y_src.get();
+    const float* x_dst_ptr = x_dst.get();
+    const float* y_dst_ptr = y_dst.get();
+
+    Array<T> H = createValueArray<T>(af::dim4(9, iterations), (T)0);
+
+    const af::dim4 rdims = rnd.dims();
+    const af::dim4 Hdims = H.dims();
+
+    unsigned iter = iterations;
+    unsigned bestIdx = 0;
+    unsigned bestInliers = 0;
+    float minMedian = FLT_MAX;
+
+    for (unsigned i = 0; i < iter; i++) {
+        const unsigned Hidx = Hdims[0] * i;
+        T* H_ptr = H.get() + Hidx;
+
+        const unsigned ridx = rdims[0] * i;
+        const float* rnd_ptr = rnd.get() + ridx;
+
+        if (computeHomography<T>(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr,
+                                 x_dst_ptr, y_dst_ptr))
+            continue;
+
+        if (htype == AF_HOMOGRAPHY_RANSAC) {
+            unsigned inliers_count = 0;
+            for (unsigned j = 0; j < nsamples; j++) {
+                float z =  H_ptr[6]*x_src_ptr[j] + H_ptr[7]*y_src_ptr[j] + H_ptr[8];
+                float x = (H_ptr[0]*x_src_ptr[j] + H_ptr[1]*y_src_ptr[j] + H_ptr[2]) / z;
+                float y = (H_ptr[3]*x_src_ptr[j] + H_ptr[4]*y_src_ptr[j] + H_ptr[5]) / z;
+
+                float dist = sq(x_dst_ptr[j] - x) + sq(y_dst_ptr[j] - y);
+                if (dist < (inlier_thr*inlier_thr))
+                    inliers_count++;
+            }
+            iter = updateIterations((nsamples - inliers_count) / (float)nsamples, iter);
+            if (inliers_count > bestInliers) {
+                bestIdx = i;
+                bestInliers = inliers_count;
+            }
+        }
+        else if (htype == AF_HOMOGRAPHY_LMEDS) {
+            std::vector<float> err(nsamples);
+            for (unsigned j = 0; j < nsamples; j++) {
+                float z =  H_ptr[6]*x_src_ptr[j] + H_ptr[7]*y_src_ptr[j] + H_ptr[8];
+                float x = (H_ptr[0]*x_src_ptr[j] + H_ptr[1]*y_src_ptr[j] + H_ptr[2]) / z;
+                float y = (H_ptr[3]*x_src_ptr[j] + H_ptr[4]*y_src_ptr[j] + H_ptr[5]) / z;
+
+                float dist = sq(x_dst_ptr[j] - x) + sq(y_dst_ptr[j] - y);
+                err[j] = sqrt(dist);
+            }
+
+            std::stable_sort(err.begin(), err.end());
+
+            float median = err[nsamples / 2];
+            if (nsamples % 2 == 0)
+                median = (median + err[nsamples / 2 - 1]) * 0.5f;
+
+            if (median < minMedian && median > FLT_EPSILON) {
+                minMedian = median;
+                bestIdx = i;
+            }
+
+        }
+    }
+
+    memcpy(bestH.get(), H.get() + bestIdx*9, 9 * sizeof(T));
+
+    if (htype == AF_HOMOGRAPHY_LMEDS) {
+        float sigma = std::max(1.4826f * (1 + 5.f/(nsamples - 4)) * (float)sqrt(minMedian), 1e-6f);
+        float dist_thr = sq(2.5f * sigma);
+        T* bestH_ptr = bestH.get();
+
+        for (unsigned j = 0; j < nsamples; j++) {
+            float z =  bestH_ptr[6]*x_src_ptr[j] + bestH_ptr[7]*y_src_ptr[j] + bestH_ptr[8];
+            float x = (bestH_ptr[0]*x_src_ptr[j] + bestH_ptr[1]*y_src_ptr[j] + bestH_ptr[2]) / z;
+            float y = (bestH_ptr[3]*x_src_ptr[j] + bestH_ptr[4]*y_src_ptr[j] + bestH_ptr[5]) / z;
+
+            float dist = sq(x_dst_ptr[j] - x) + sq(y_dst_ptr[j] - y);
+            if (dist <= dist_thr)
+                bestInliers++;
+        }
+    }
+
+    return bestInliers;
+}
+
+template<typename T>
+int homography(Array<T> &bestH,
+               const Array<float> &x_src,
+               const Array<float> &y_src,
+               const Array<float> &x_dst,
+               const Array<float> &y_dst,
+               const af_homography_type htype,
+               const float inlier_thr,
+               const unsigned iterations)
+{
+    const af::dim4 idims = x_src.dims();
+    const unsigned nsamples = idims[0];
+
+    unsigned iter = iterations;
+    if (htype == AF_HOMOGRAPHY_LMEDS)
+        iter = std::min(iter, (unsigned)(log(1.f - LMEDSConfidence) / log(1.f - pow(1.f - LMEDSOutlierRatio, 4.f))));
+
+    af::dim4 rdims(4, iter);
+    Array<float> frnd = randu<float>(rdims);
+    Array<float> fctr = createValueArray<float>(rdims, (float)nsamples);
+    Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
+
+    return findBestHomography<T>(bestH, x_src, y_src, x_dst, y_dst, rnd, iter, nsamples, inlier_thr, htype);
+}
+
+#define INSTANTIATE(T)                                                                  \
+    template int homography<T>(Array<T> &bestH,                                         \
+                               const Array<float> &x_src, const Array<float> &y_src,    \
+                               const Array<float> &x_dst, const Array<float> &y_dst,    \
+                               const af_homography_type htype, const float inlier_thr,  \
+                               const unsigned iterations);
+
+INSTANTIATE(float )
+INSTANTIATE(double)
+
+}
diff --git a/src/backend/cpu/homography.hpp b/src/backend/cpu/homography.hpp
new file mode 100644
index 0000000000..7b14d13a73
--- /dev/null
+++ b/src/backend/cpu/homography.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cpu
+{
+
+template<typename T>
+int homography(Array<T> &H,
+               const Array<float> &x_src, const Array<float> &y_src,
+               const Array<float> &x_dst, const Array<float> &y_dst,
+               const af_homography_type htype, const float inlier_thr,
+               const unsigned iterations);
+
+}
diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp
index 3112991406..2973ae4409 100644
--- a/src/backend/cpu/identity.cpp
+++ b/src/backend/cpu/identity.cpp
@@ -42,10 +42,12 @@ namespace cpu
     INSTANTIATE_IDENTITY(cfloat)
     INSTANTIATE_IDENTITY(cdouble)
     INSTANTIATE_IDENTITY(int)
+    INSTANTIATE_IDENTITY(uint)
     INSTANTIATE_IDENTITY(intl)
     INSTANTIATE_IDENTITY(uintl)
-    INSTANTIATE_IDENTITY(uint)
     INSTANTIATE_IDENTITY(char)
     INSTANTIATE_IDENTITY(uchar)
+    INSTANTIATE_IDENTITY(short)
+    INSTANTIATE_IDENTITY(ushort)
 
 }
diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp
index 8b211fe84d..947afa2351 100644
--- a/src/backend/cpu/image.cpp
+++ b/src/backend/cpu/image.cpp
@@ -46,6 +46,8 @@ namespace cpu
     INSTANTIATE(uint)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(ushort)
+    INSTANTIATE(short)
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp
index 162e67fb46..e6d3daba4e 100644
--- a/src/backend/cpu/index.cpp
+++ b/src/backend/cpu/index.cpp
@@ -122,5 +122,7 @@ INSTANTIATE(intl   )
 INSTANTIATE(int    )
 INSTANTIATE(uchar  )
 INSTANTIATE(char   )
+INSTANTIATE(ushort )
+INSTANTIATE(short  )
 
 }
diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp
index 668500f697..47bcb924e4 100644
--- a/src/backend/cpu/iota.cpp
+++ b/src/backend/cpu/iota.cpp
@@ -67,4 +67,6 @@ namespace cpu
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index 199a0befb3..2928af9620 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -185,6 +185,8 @@ namespace cpu
     INSTANTIATE(af_min_t, uintl  )
     INSTANTIATE(af_min_t, char   )
     INSTANTIATE(af_min_t, uchar  )
+    INSTANTIATE(af_min_t, short  )
+    INSTANTIATE(af_min_t, ushort )
 
     //max
     INSTANTIATE(af_max_t, float  )
@@ -197,4 +199,6 @@ namespace cpu
     INSTANTIATE(af_max_t, uintl  )
     INSTANTIATE(af_max_t, char   )
     INSTANTIATE(af_max_t, uchar  )
+    INSTANTIATE(af_max_t, short  )
+    INSTANTIATE(af_max_t, ushort )
 }
diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp
index eeb34a01c7..78d2a51ab4 100644
--- a/src/backend/cpu/join.cpp
+++ b/src/backend/cpu/join.cpp
@@ -226,6 +226,8 @@ namespace cpu
     INSTANTIATE(uintl,   uintl)
     INSTANTIATE(uchar,   uchar)
     INSTANTIATE(char,    char)
+    INSTANTIATE(ushort,  ushort)
+    INSTANTIATE(short,   short)
 
 #undef INSTANTIATE
 
@@ -242,6 +244,8 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(ushort)
+    INSTANTIATE(short)
 
 #undef INSTANTIATE
 }
diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp
index f3e18bd4d6..128cc02823 100644
--- a/src/backend/cpu/lookup.cpp
+++ b/src/backend/cpu/lookup.cpp
@@ -80,6 +80,10 @@ Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const
     template Array<T>  lookup<T, double  >(const Array<T> &input, const Array<double  > &indices, const unsigned dim); \
     template Array<T>  lookup<T, int     >(const Array<T> &input, const Array<int     > &indices, const unsigned dim); \
     template Array<T>  lookup<T, unsigned>(const Array<T> &input, const Array<unsigned> &indices, const unsigned dim); \
+    template Array<T>  lookup<T, short   >(const Array<T> &input, const Array<short   > &indices, const unsigned dim); \
+    template Array<T>  lookup<T, ushort  >(const Array<T> &input, const Array<ushort  > &indices, const unsigned dim); \
+    template Array<T>  lookup<T, intl    >(const Array<T> &input, const Array<intl    > &indices, const unsigned dim); \
+    template Array<T>  lookup<T, uintl   >(const Array<T> &input, const Array<uintl   > &indices, const unsigned dim); \
     template Array<T>  lookup<T, uchar   >(const Array<T> &input, const Array<uchar   > &indices, const unsigned dim);
 
 INSTANTIATE(float   );
@@ -92,5 +96,7 @@ INSTANTIATE(intl    );
 INSTANTIATE(uintl   );
 INSTANTIATE(uchar   );
 INSTANTIATE(char    );
+INSTANTIATE(ushort  );
+INSTANTIATE(short   );
 
 }
diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp
index b026529dba..4d930145d5 100644
--- a/src/backend/cpu/match_template.cpp
+++ b/src/backend/cpu/match_template.cpp
@@ -159,5 +159,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp
index 86e1d6eea2..b52eaf9387 100644
--- a/src/backend/cpu/meanshift.cpp
+++ b/src/backend/cpu/meanshift.cpp
@@ -155,5 +155,9 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
+INSTANTIATE(intl  )
+INSTANTIATE(uintl )
 
 }
diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp
index 1047a52723..3ded3c045a 100644
--- a/src/backend/cpu/medfilt.cpp
+++ b/src/backend/cpu/medfilt.cpp
@@ -145,5 +145,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(ushort)
+INSTANTIATE(short )
 
 }
diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp
index c2a1441e27..ac10643c9b 100644
--- a/src/backend/cpu/memory.cpp
+++ b/src/backend/cpu/memory.cpp
@@ -241,4 +241,6 @@ namespace cpu
     INSTANTIATE(uchar)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(ushort)
+    INSTANTIATE(short )
 }
diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp
index ff7b49d0de..eb2e1de339 100644
--- a/src/backend/cpu/morph.cpp
+++ b/src/backend/cpu/morph.cpp
@@ -168,5 +168,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(ushort)
+INSTANTIATE(short )
 
 }
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index f706769282..79d41516e3 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -80,6 +80,15 @@ struct dist_op<uchar, To, AF_SHD>
     }
 };
 
+template<typename To>
+struct dist_op<ushort, To, AF_SHD>
+{
+    To operator()(ushort v1, ushort v2)
+    {
+        return __builtin_popcount(v1 ^ v2);
+    }
+};
+
 template<typename T, typename To, af_match_type dist_type>
 void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
                         const Array<T>& query, const Array<T>& train,
@@ -169,7 +178,9 @@ INSTANTIATE(uint  , uint)
 INSTANTIATE(intl  , intl)
 INSTANTIATE(uintl , uintl)
 INSTANTIATE(uchar , uint)
+INSTANTIATE(ushort, uint)
+INSTANTIATE(short , int)
 
-INSTANTIATE(uintl, uint)    // For Hamming
+INSTANTIATE(uintl , uint)    // For Hamming
 
 }
diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp
index ac8ec54712..fc782eab76 100644
--- a/src/backend/cpu/platform.cpp
+++ b/src/backend/cpu/platform.cpp
@@ -172,6 +172,11 @@ CPUInfo::CPUInfo()
 namespace cpu
 {
 
+int getBackend()
+{
+    return AF_BACKEND_CPU;
+}
+
 static const std::string get_system(void)
 {
     std::string arch = (sizeof(void *) == 4) ? "32-bit " : "64-bit ";
diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp
index e899837b8c..2e52cd13a6 100644
--- a/src/backend/cpu/platform.hpp
+++ b/src/backend/cpu/platform.hpp
@@ -10,6 +10,8 @@
 #include <string>
 
 namespace cpu {
+    int getBackend();
+
     std::string getInfo();
 
     bool isDoubleSupported(int device);
diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp
index 68c4300210..9de1993f2d 100644
--- a/src/backend/cpu/plot.cpp
+++ b/src/backend/cpu/plot.cpp
@@ -41,6 +41,8 @@ namespace cpu
     INSTANTIATE(int)
     INSTANTIATE(uint)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
 
 #endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp
new file mode 100644
index 0000000000..c0e26aaa34
--- /dev/null
+++ b/src/backend/cpu/plot3.cpp
@@ -0,0 +1,48 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <plot3.hpp>
+#include <err_cpu.hpp>
+#include <stdexcept>
+#include <graphics_common.hpp>
+#include <reduce.hpp>
+#include <memory.hpp>
+
+using af::dim4;
+
+namespace cpu
+{
+    template<typename T>
+    void copy_plot3(const Array<T> &P, fg::Plot3* plot3)
+    {
+        CheckGL("Before CopyArrayToVBO");
+
+        glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo());
+        glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get());
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+        CheckGL("In CopyArrayToVBO");
+    }
+
+    #define INSTANTIATE(T)  \
+        template void copy_plot3<T>(const Array<T> &P, fg::Plot3* plot3);
+
+    INSTANTIATE(float)
+    INSTANTIATE(double)
+    INSTANTIATE(int)
+    INSTANTIATE(uint)
+    INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/plot3.hpp b/src/backend/cpu/plot3.hpp
new file mode 100644
index 0000000000..1e5c97ab0a
--- /dev/null
+++ b/src/backend/cpu/plot3.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace cpu
+{
+    template<typename T>
+    void copy_plot3(const Array<T> &P, fg::Plot3* plot3);
+}
+
+#endif
+
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp
index 4c91b96fb1..ab4230e682 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/random.cpp
@@ -133,6 +133,8 @@ INSTANTIATE_UNIFORM(uint)
 INSTANTIATE_UNIFORM(intl)
 INSTANTIATE_UNIFORM(uintl)
 INSTANTIATE_UNIFORM(uchar)
+INSTANTIATE_UNIFORM(short)
+INSTANTIATE_UNIFORM(ushort)
 
 #define INSTANTIATE_NORMAL(T)                              \
     template Array<T>  randn<T>(const af::dim4 &dims);
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index f0c8de11f3..eabf3a1ee1 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -82,4 +82,6 @@ namespace cpu
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(ushort)
+    INSTANTIATE(short)
 }
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index 5724508be6..a38d06118c 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -18,6 +18,20 @@
 
 using af::dim4;
 
+template<>
+struct Binary<cdouble, af_add_t>
+{
+    cdouble init()
+    {
+        return cdouble(0,0);
+    }
+
+    cdouble operator()(cdouble lhs, cdouble rhs)
+    {
+        return cdouble(real(lhs)+real(rhs), imag(lhs)+imag(rhs));
+    }
+};
+
 namespace cpu
 {
     template<af_op_t op, typename Ti, typename To, int D>
@@ -139,6 +153,8 @@ namespace cpu
     INSTANTIATE(af_min_t, uintl  , uintl  )
     INSTANTIATE(af_min_t, char   , char   )
     INSTANTIATE(af_min_t, uchar  , uchar  )
+    INSTANTIATE(af_min_t, short  , short  )
+    INSTANTIATE(af_min_t, ushort , ushort )
 
     //max
     INSTANTIATE(af_max_t, float  , float  )
@@ -151,6 +167,8 @@ namespace cpu
     INSTANTIATE(af_max_t, uintl  , uintl  )
     INSTANTIATE(af_max_t, char   , char   )
     INSTANTIATE(af_max_t, uchar  , uchar  )
+    INSTANTIATE(af_max_t, short  , short  )
+    INSTANTIATE(af_max_t, ushort , ushort )
 
     //sum
     INSTANTIATE(af_add_t, float  , float  )
@@ -158,13 +176,23 @@ namespace cpu
     INSTANTIATE(af_add_t, cfloat , cfloat )
     INSTANTIATE(af_add_t, cdouble, cdouble)
     INSTANTIATE(af_add_t, int    , int    )
+    INSTANTIATE(af_add_t, int    , float  )
     INSTANTIATE(af_add_t, uint   , uint   )
+    INSTANTIATE(af_add_t, uint   , float  )
     INSTANTIATE(af_add_t, intl   , intl   )
+    INSTANTIATE(af_add_t, intl   , double )
     INSTANTIATE(af_add_t, uintl  , uintl  )
+    INSTANTIATE(af_add_t, uintl  , double )
     INSTANTIATE(af_add_t, char   , int    )
+    INSTANTIATE(af_add_t, char   , float  )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, uchar  , float  )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, short  , float  )
+    INSTANTIATE(af_add_t, ushort , uint   )
+    INSTANTIATE(af_add_t, ushort , float  )
 
-    //sum
+    //mul
     INSTANTIATE(af_mul_t, float  , float  )
     INSTANTIATE(af_mul_t, double , double )
     INSTANTIATE(af_mul_t, cfloat , cfloat )
@@ -175,6 +203,8 @@ namespace cpu
     INSTANTIATE(af_mul_t, uintl  , uintl  )
     INSTANTIATE(af_mul_t, char   , int    )
     INSTANTIATE(af_mul_t, uchar  , uint   )
+    INSTANTIATE(af_mul_t, short  , int    )
+    INSTANTIATE(af_mul_t, ushort , uint   )
 
     // count
     INSTANTIATE(af_notzero_t, float  , uint)
@@ -187,6 +217,8 @@ namespace cpu
     INSTANTIATE(af_notzero_t, uintl  , uint)
     INSTANTIATE(af_notzero_t, char   , uint)
     INSTANTIATE(af_notzero_t, uchar  , uint)
+    INSTANTIATE(af_notzero_t, short  , uint)
+    INSTANTIATE(af_notzero_t, ushort , uint)
 
     //anytrue
     INSTANTIATE(af_or_t, float  , char)
@@ -199,6 +231,8 @@ namespace cpu
     INSTANTIATE(af_or_t, uintl  , char)
     INSTANTIATE(af_or_t, char   , char)
     INSTANTIATE(af_or_t, uchar  , char)
+    INSTANTIATE(af_or_t, short  , char)
+    INSTANTIATE(af_or_t, ushort , char)
 
     //alltrue
     INSTANTIATE(af_and_t, float  , char)
@@ -211,4 +245,6 @@ namespace cpu
     INSTANTIATE(af_and_t, uintl  , char)
     INSTANTIATE(af_and_t, char   , char)
     INSTANTIATE(af_and_t, uchar  , char)
+    INSTANTIATE(af_and_t, short  , char)
+    INSTANTIATE(af_and_t, ushort , char)
 }
diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp
index b1377689a5..b753fb5547 100644
--- a/src/backend/cpu/regions.cpp
+++ b/src/backend/cpu/regions.cpp
@@ -208,5 +208,7 @@ INSTANTIATE(float )
 INSTANTIATE(double)
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index 42da24e435..a9824a4444 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -70,6 +70,8 @@ namespace cpu
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 
 }
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index a4ba43f0ad..8c4da58934 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -217,4 +217,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index 2293ee2037..a4af64b669 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -115,4 +115,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp
index 9cd4163ec3..2bdda210a2 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/scan.cpp
@@ -108,6 +108,8 @@ namespace cpu
     INSTANTIATE(af_add_t, uintl  , uintl  )
     INSTANTIATE(af_add_t, char   , int    )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, ushort , uint   )
     INSTANTIATE(af_notzero_t, char  , uint   )
 
 }
diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp
index 286e884898..7b2cc81735 100644
--- a/src/backend/cpu/select.cpp
+++ b/src/backend/cpu/select.cpp
@@ -140,4 +140,6 @@ namespace cpu
     INSTANTIATE(uintl  )
     INSTANTIATE(char   )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 }
diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp
index 3a8239ed1d..3215e6d5c2 100644
--- a/src/backend/cpu/set.cpp
+++ b/src/backend/cpu/set.cpp
@@ -115,4 +115,8 @@ namespace cpu
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index eff5c0923c..05cac4c678 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -82,5 +82,7 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp
index 1f38ffffb4..70bb11d1ae 100644
--- a/src/backend/cpu/sift.cpp
+++ b/src/backend/cpu/sift.cpp
@@ -36,14 +36,18 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio)
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH)
 {
 #ifdef AF_BUILD_SIFT
     return sift_impl<T, convAccT>(x, y, score, ori, size, desc, in, n_layers,
                                   contrast_thr, edge_thr, init_sigma, double_input,
-                                  img_scale, feature_ratio);
+                                  img_scale, feature_ratio, compute_GLOH);
 #else
-    AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+    if (compute_GLOH)
+        AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AF_ERR_NONFREE);
+    else
+        AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AF_ERR_NONFREE);
 #endif
 }
 
@@ -54,7 +58,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
                                         const Array<T>& in, const unsigned n_layers,        \
                                         const float contrast_thr, const float edge_thr,     \
                                         const float init_sigma, const bool double_input,    \
-                                        const float img_scale, const float feature_ratio);
+                                        const float img_scale, const float feature_ratio,   \
+                                        const bool compute_GLOH);
 
 INSTANTIATE(float , float )
 INSTANTIATE(double, double)
diff --git a/src/backend/cpu/sift.hpp b/src/backend/cpu/sift.hpp
index 044b4e0fb2..1ceea4b8c7 100644
--- a/src/backend/cpu/sift.hpp
+++ b/src/backend/cpu/sift.hpp
@@ -21,6 +21,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio);
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH);
 
 }
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
index 5ba9fe4db1..514a134c7d 100644
--- a/src/backend/cpu/sift_nonfree.hpp
+++ b/src/backend/cpu/sift_nonfree.hpp
@@ -117,6 +117,18 @@ namespace cpu
 // factor used to convert floating-point descriptor to unsigned char
     static const float IntDescrFctr = 512.f;
 
+// Number of GLOH bins in radial direction
+    static const unsigned GLOHRadialBins = 3;
+
+// Radiuses of GLOH descriptors
+    static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+    static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+    static const unsigned GLOHHistBins = 16;
+
     typedef struct
     {
         float    f[4];
@@ -639,9 +651,8 @@ namespace cpu
             int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
 
             int len = radius*2+1;
-            const int histlen = d*d*n;
 
-            for (int i = 0; i < histlen; i++)
+            for (int i = 0; i < (int)desc_len; i++)
                 desc[i] = 0.f;
 
             // Calculate orientation histogram
@@ -700,15 +711,154 @@ namespace cpu
                 }
             }
 
-            normalizeDesc(desc, histlen);
+            normalizeDesc(desc, desc_len);
+
+            for (int i = 0; i < (int)desc_len; i++)
+                desc[i] = min(desc[i], DescrMagThr);
+
+            normalizeDesc(desc, desc_len);
+
+            // Calculate final descriptor values
+            for (int k = 0; k < (int)desc_len; k++) {
+                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
+            }
+        }
+    }
+
+// Computes GLOH feature descriptors for features in an array. Based on Section III-B
+// of Mikolajczyk and Schmid paper.
+    template<typename T>
+    void computeGLOHDescriptor(
+        float* desc_out,
+        const unsigned desc_len,
+        const float* x_in,
+        const float* y_in,
+        const unsigned* layer_in,
+        const float* response_in,
+        const float* size_in,
+        const float* ori_in,
+        const unsigned total_feat,
+        const std::vector< Array<T> >& gauss_pyr,
+        const int d,
+        const unsigned rb,
+        const unsigned ab,
+        const unsigned hb,
+        const float scale,
+        const unsigned octave,
+        const unsigned n_layers)
+    {
+        float desc[272];
+
+        for (unsigned f = 0; f < total_feat; f++) {
+            const unsigned layer = layer_in[f];
+            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+            const float size = size_in[f];
+            const int fx = round(x_in[f] * scale);
+            const int fy = round(y_in[f] * scale);
+
+            // Points img to correct Gaussian pyramid layer
+            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+            const T* img_ptr = img.get();
+            af::dim4 idims = img.dims();
+
+            float cos_t = cos(ori);
+            float sin_t = sin(ori);
+            float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+            float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+            float exp_denom = GLOHRadii[rb-1] * 0.5f;
+
+            float hist_width = DescrSclFctr * size * scale * 0.5f;
+
+            // Keep same descriptor radius used for SIFT
+            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+            // Alternative radius size calculation, changing the radius weight
+            // (rw) in the range of 0.25f-0.75f gives different results,
+            // increasing it tends to show a better recall rate but with a
+            // smaller amount of correct matches
+            //float rw = 0.5f;
+            //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
+
+            int len = radius*2+1;
+
+            for (int i = 0; i < (int)desc_len; i++)
+                desc[i] = 0.f;
+
+            // Calculate orientation histogram
+            for (int l = 0; l < len*len; l++) {
+                int i = l / len - radius;
+                int j = l % len - radius;
+
+                int y = fy + i;
+                int x = fx + j;
+
+                float x_rot = (j * cos_t - i * sin_t);
+                float y_rot = (j * sin_t + i * cos_t);
+
+                float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+                float theta = atan2(y_rot, x_rot);
+                while (theta < 0.0f)
+                    theta += PI_VAL*2;
+                while (theta >= PI_VAL*2)
+                    theta -= PI_VAL*2;
+
+                float tbin = theta * polar_bins_per_rad;
+                float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+                             ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+                             min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
+
+                if (r <= GLOHRadii[rb-1] &&
+                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
+                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+                    float grad_mag = sqrt(dx*dx + dy*dy);
+                    float grad_ori = atan2(dy, dx) - ori;
+                    while (grad_ori < 0.0f)
+                        grad_ori += PI_VAL*2;
+                    while (grad_ori >= PI_VAL*2)
+                        grad_ori -= PI_VAL*2;
+
+                    float w = exp(-r / exp_denom);
+                    float obin = grad_ori * hist_bins_per_rad;
+                    float mag = grad_mag*w;
+
+                    int t0 = floor(tbin);
+                    int r0 = floor(rbin);
+                    int o0 = floor(obin);
+                    tbin -= t0;
+                    rbin -= r0;
+                    obin -= o0;
+
+                    for (int rl = 0; rl <= 1; rl++) {
+                        int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+                        float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+                        if (rb >= 0 && rb <= 2) {
+                            for (int tl = 0; tl <= 1; tl++) {
+                                int tb = (t0 + tl) % ab;
+                                float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+                                for (int ol = 0; ol <= 1; ol++) {
+                                    int ob = (o0 + ol) % hb;
+                                    float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+                                    unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+                                    desc[idx] += v_o;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            normalizeDesc(desc, desc_len);
 
-            for (int i = 0; i < d*d*n; i++)
+            for (int i = 0; i < (int)desc_len; i++)
                 desc[i] = min(desc[i], DescrMagThr);
 
-            normalizeDesc(desc, histlen);
+            normalizeDesc(desc, desc_len);
 
             // Calculate final descriptor values
-            for (int k = 0; k < d*d*n; k++) {
+            for (int k = 0; k < (int)desc_len; k++) {
                 desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
             }
         }
@@ -815,7 +965,8 @@ namespace cpu
                        const Array<T>& in, const unsigned n_layers,
                        const float contrast_thr, const float edge_thr,
                        const float init_sigma, const bool double_input,
-                       const float img_scale, const float feature_ratio)
+                       const float img_scale, const float feature_ratio,
+                       const bool compute_GLOH)
     {
         af::dim4 idims = in.dims();
 
@@ -840,7 +991,10 @@ namespace cpu
 
         const unsigned d = DescrWidth;
         const unsigned n = DescrHistBins;
-        const unsigned desc_len = d*d*n;
+        const unsigned rb = GLOHRadialBins;
+        const unsigned ab = GLOHAngularBins;
+        const unsigned hb = GLOHHistBins;
+        const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
 
         for (unsigned i = 0; i < n_octaves; i++) {
             af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
@@ -907,7 +1061,7 @@ namespace cpu
 
             std::vector<feat_t> sorted_feat;
             array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
-            std::sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+            std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
 
             memFree(interp_x);
             memFree(interp_y);
@@ -966,10 +1120,17 @@ namespace cpu
             float scale = 1.f/(1 << i);
             if (double_input) scale *= 2.f;
 
-            computeDescriptor<T>(desc, desc_len,
-                                 oriented_x, oriented_y, oriented_layer,
-                                 oriented_response, oriented_size, oriented_ori,
-                                 oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
+            if (compute_GLOH)
+                computeGLOHDescriptor<T>(desc, desc_len,
+                                         oriented_x, oriented_y, oriented_layer,
+                                         oriented_response, oriented_size, oriented_ori,
+                                         oriented_feat, gauss_pyr, d, rb, ab, hb,
+                                         scale, i, n_layers);
+            else
+                computeDescriptor<T>(desc, desc_len,
+                                     oriented_x, oriented_y, oriented_layer,
+                                     oriented_response, oriented_size, oriented_ori,
+                                     oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
 
             total_feat += oriented_feat;
             feat_pyr[i] = oriented_feat;
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index 41cd8ce11b..3c6b1740d5 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -104,5 +104,7 @@ INSTANTIATE(int   , int)
 INSTANTIATE(uint  , int)
 INSTANTIATE(char  , int)
 INSTANTIATE(uchar , int)
+INSTANTIATE(short , int)
+INSTANTIATE(ushort, int)
 
 }
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index 6c1ebb7cdd..0b3fb9aabe 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -81,4 +81,8 @@ namespace cpu
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index b96c6cc55a..4b0a092834 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -122,6 +122,11 @@ namespace cpu
     INSTANTIATE(Tk, uint)      \
     INSTANTIATE(Tk, char)      \
     INSTANTIATE(Tk, uchar)     \
+    INSTANTIATE(Tk, short)     \
+    INSTANTIATE(Tk, ushort)    \
+    INSTANTIATE(Tk, intl)      \
+    INSTANTIATE(Tk, uintl)     \
+
 
     INSTANTIATE1(float)
     INSTANTIATE1(double)
@@ -129,4 +134,8 @@ namespace cpu
     INSTANTIATE1(uint)
     INSTANTIATE1(char)
     INSTANTIATE1(uchar)
+    INSTANTIATE1(short)
+    INSTANTIATE1(ushort)
+    INSTANTIATE1(intl)
+    INSTANTIATE1(uintl)
 }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index 75690e062e..eb6b4bee60 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -105,4 +105,8 @@ namespace cpu
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp
new file mode 100644
index 0000000000..39f375a6fe
--- /dev/null
+++ b/src/backend/cpu/surface.cpp
@@ -0,0 +1,48 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <surface.hpp>
+#include <err_cpu.hpp>
+#include <stdexcept>
+#include <graphics_common.hpp>
+#include <reduce.hpp>
+#include <memory.hpp>
+
+using af::dim4;
+
+namespace cpu
+{
+    template<typename T>
+    void copy_surface(const Array<T> &P, fg::Surface* surface)
+    {
+        CheckGL("Before CopyArrayToVBO");
+
+        glBindBuffer(GL_ARRAY_BUFFER, surface->vbo());
+        glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get());
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+        CheckGL("In CopyArrayToVBO");
+    }
+
+    #define INSTANTIATE(T)  \
+        template void copy_surface<T>(const Array<T> &P, fg::Surface* surface);
+
+    INSTANTIATE(float)
+    INSTANTIATE(double)
+    INSTANTIATE(int)
+    INSTANTIATE(uint)
+    INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/cpu/surface.hpp b/src/backend/cpu/surface.hpp
new file mode 100644
index 0000000000..46a4c4b652
--- /dev/null
+++ b/src/backend/cpu/surface.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace cpu
+{
+    template<typename T>
+    void copy_surface(const Array<T> &P, fg::Surface* surface);
+}
+
+#endif
+
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index ad5b702e5d..77493915c0 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -111,14 +111,21 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
     memFree(resp);
 
     const unsigned corners_out = min(corners_found, corner_lim);
-    if (corners_out == 0)
+    if (corners_out == 0) {
+        memFree(x_corners);
+        memFree(y_corners);
+        memFree(resp_corners);
+        x_out    = createEmptyArray<float>(dim4());
+        y_out    = createEmptyArray<float>(dim4());
+        resp_out = createEmptyArray<float>(dim4());
         return 0;
+    } else {
 
-    x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
-    y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
-    resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
-
-    return corners_out;
+        x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
+        y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
+        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
+        return corners_out;
+    }
 }
 
 #define INSTANTIATE(T) \
@@ -132,5 +139,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp
index 4ca30d2f3c..77e72afd09 100644
--- a/src/backend/cpu/tile.cpp
+++ b/src/backend/cpu/tile.cpp
@@ -71,5 +71,7 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index d1cf58e55e..68e8d96eba 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -142,4 +142,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index f820f9ea5d..bea0aa0d6f 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -159,6 +159,8 @@ INSTANTIATE(uint   )
 INSTANTIATE(uchar  )
 INSTANTIATE(intl   )
 INSTANTIATE(uintl  )
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 
 }
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index 82c4fd1edc..6b0f326aad 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -85,5 +85,7 @@ Array<T> triangle(const Array<T> &in)
     INSTANTIATE(uintl)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/types.hpp b/src/backend/cpu/types.hpp
index a281b6b6a4..0776df783c 100644
--- a/src/backend/cpu/types.hpp
+++ b/src/backend/cpu/types.hpp
@@ -16,6 +16,7 @@ namespace cpu
     typedef std::complex<double>    cdouble;
     typedef unsigned int            uint;
     typedef unsigned char           uchar;
+    typedef unsigned short          ushort;
 
     template<typename T> struct is_complex          { static const bool value = false;  };
     template<> struct           is_complex<cfloat>  { static const bool value = true;   };
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 466da2e6c2..f9c25f9a9e 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -119,4 +119,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp
index c1ffd0f973..6c0f8c7acc 100644
--- a/src/backend/cpu/where.cpp
+++ b/src/backend/cpu/where.cpp
@@ -72,5 +72,7 @@ namespace cpu
     INSTANTIATE(intl   )
     INSTANTIATE(uintl  )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp
index 1ed91500f8..a04a6f5250 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/wrap.cpp
@@ -119,4 +119,6 @@ namespace cpu
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index ed86a8e5ea..d7dbec56bc 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -46,7 +46,9 @@ namespace cuda
         static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
 #endif
         if (!is_device) {
-            CUDA_CHECK(cudaMemcpy(data.get(), in_data, dims.elements() * sizeof(T), cudaMemcpyHostToDevice));
+            CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, dims.elements() * sizeof(T),
+                        cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
         }
     }
 
@@ -73,7 +75,7 @@ namespace cuda
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, JIT::Node_ptr n) :
-        info(-1, dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(), data_dims(dims),
         node(n), offset(0), ready(false), owner(true)
     {
@@ -238,9 +240,9 @@ namespace cuda
 
         T *ptr = arr.get();
 
-        CUDA_CHECK(cudaMemcpy(ptr + arr.getOffset(), data,
-                              bytes,
-                              cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes, cudaMemcpyHostToDevice,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
         return;
     }
@@ -291,5 +293,7 @@ namespace cuda
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index 696aba721c..bb8fca013c 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -69,9 +69,34 @@ ENDIF()
 ADD_DEFINITIONS(-DAF_CUDA)
 
 IF(${CUDA_VERSION_MAJOR} LESS 7)
-    MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available." )
+    # Use CPU Lapack as fallback?
+    OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when CUDA is 6.5 or older" OFF)
+    MARK_AS_ADVANCED(CUDA_LAPACK_CPU_FALLBACK)
+
+    IF(${CUDA_LAPACK_CPU_FALLBACK})
+        ## Try to use CPU side lapack
+        IF(APPLE)
+            FIND_PACKAGE(LAPACK)
+        ELSE(APPLE) # Linux and Windows
+            FIND_PACKAGE(LAPACKE)
+        ENDIF(APPLE)
+
+        IF(NOT LAPACK_FOUND)
+            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+        ELSE(NOT LAPACK_FOUND)
+            MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.")
+            ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA)
+            IF(USE_CUDA_MKL)
+                MESSAGE("Using MKL")
+                ADD_DEFINITIONS(-DUSE_MKL)
+            ENDIF()
+        ENDIF()
+    ELSE()
+        MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+    ENDIF()
     IF(CMAKE_VERSION VERSION_LESS 3.2)
         SET(CUDA_cusolver_LIBRARY)
+        MARK_AS_ADVANCED(CUDA_cusolver_LIBRARY)
     ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
 ELSE(${CUDA_VERSION_MAJOR} LESS 7)
     MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}")
@@ -97,6 +122,10 @@ INCLUDE_DIRECTORIES(
     ${CUDA_NVVM_INCLUDE_DIR}
     )
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+  INCLUDE_DIRECTORIES(${LAPACK_INCLUDE_DIR})
+ENDIF()
+
 FILE(GLOB cuda_headers
      "*.hpp"
      "*.h")
@@ -121,6 +150,16 @@ SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
 SOURCE_GROUP(backend\\cuda\\JIT FILES ${jit_sources})
 SOURCE_GROUP(backend\\cuda\\kernel\\Headers FILES ${kernel_headers})
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+    FILE(GLOB cpu_lapack_sources
+        "cpu_lapack/*.cpp")
+    FILE(GLOB cpu_lapack_headers
+        "cpu_lapack/*.hpp")
+
+    SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Headers FILES ${cpu_lapack_headers})
+    SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Sources FILES ${cpu_lapack_sources})
+ENDIF()
+
 FILE(GLOB backend_headers
     "../*.hpp"
     "../*.h"
@@ -256,6 +295,8 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED
                 ${cuda_sources}
                 ${jit_sources}
                 ${kernel_headers}
+                ${cpu_lapack_headers}
+                ${cpu_lapack_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${c_headers}
@@ -277,6 +318,10 @@ IF(FORGE_FOUND)
     TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES})
 ENDIF()
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+  TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES})
+ENDIF()
+
 SET_TARGET_PROPERTIES(afcuda PROPERTIES
     VERSION "${AF_VERSION}"
     SOVERSION "${AF_VERSION_MAJOR}")
diff --git a/src/backend/cuda/JIT/arith.cu b/src/backend/cuda/JIT/arith.cu
index 01e5f41b9f..adfa9e9068 100644
--- a/src/backend/cuda/JIT/arith.cu
+++ b/src/backend/cuda/JIT/arith.cu
@@ -25,6 +25,8 @@
     ARITH_BASIC(fn, op, uchar)                          \
     ARITH_BASIC(fn, op, intl)                           \
     ARITH_BASIC(fn, op, uintl)                          \
+    ARITH_BASIC(fn, op, short)                          \
+    ARITH_BASIC(fn, op, ushort)                         \
                                                         \
     __device__ cfloat ___##fn(cfloat a, cfloat b)       \
     {                                                   \
diff --git a/src/backend/cuda/JIT/cast.cu b/src/backend/cuda/JIT/cast.cu
index db41c524c1..8905955145 100644
--- a/src/backend/cuda/JIT/cast.cu
+++ b/src/backend/cuda/JIT/cast.cu
@@ -22,14 +22,18 @@
     CAST_BASIC(___mk##X, T, uchar)              \
     CAST_BASIC(___mk##X, T, intl)               \
     CAST_BASIC(___mk##X, T, uintl)              \
+    CAST_BASIC(___mk##X, T, short)              \
+    CAST_BASIC(___mk##X, T, ushort)             \
 
-CAST(float, S)
+CAST(float , S)
 CAST(double, D)
-CAST(int, I)
-CAST(intl, X)
-CAST(uint, U)
-CAST(uchar, V)
-CAST(uintl, Y)
+CAST(int   , I)
+CAST(intl  , X)
+CAST(short , P)
+CAST(uint  , U)
+CAST(uchar , V)
+CAST(uintl , Y)
+CAST(ushort, Q)
 
 CAST_BASIC_BOOL(___mkJ, char, float)
 CAST_BASIC_BOOL(___mkJ, char, double)
@@ -39,6 +43,8 @@ CAST_BASIC_BOOL(___mkJ, char, char)
 CAST_BASIC_BOOL(___mkJ, char, uchar)
 CAST_BASIC_BOOL(___mkJ, char, intl)
 CAST_BASIC_BOOL(___mkJ, char, uintl)
+CAST_BASIC_BOOL(___mkJ, char, short)
+CAST_BASIC_BOOL(___mkJ, char, ushort)
 
 #define CPLX_BASIC(FN, To, Tr, Ti)              \
     __device__ To FN(Ti in)                     \
@@ -56,6 +62,8 @@ CAST_BASIC_BOOL(___mkJ, char, uintl)
     CPLX_BASIC(___mk##X, T, Tr, uchar)          \
     CPLX_BASIC(___mk##X, T, Tr, uintl)          \
     CPLX_BASIC(___mk##X, T, Tr, intl)           \
+    CPLX_BASIC(___mk##X, T, Tr, ushort)         \
+    CPLX_BASIC(___mk##X, T, Tr, short)          \
 
 CPLX_CAST(cfloat, float, C)
 CPLX_CAST(cdouble, double, Z)
diff --git a/src/backend/cuda/JIT/exp.cu b/src/backend/cuda/JIT/exp.cu
index 23a33004bc..3f110b4328 100644
--- a/src/backend/cuda/JIT/exp.cu
+++ b/src/backend/cuda/JIT/exp.cu
@@ -34,6 +34,8 @@ __device__ float sigmoidf(float in)
     MATH_BASIC(fn, uchar)                       \
     MATH_BASIC(fn, uintl)                       \
     MATH_BASIC(fn, intl)                        \
+    MATH_BASIC(fn, ushort)                      \
+    MATH_BASIC(fn, short)                       \
     __device__ double ___##fn(double a)         \
     {                                           \
         return fn(a);                           \
@@ -68,6 +70,8 @@ MATH(cbrt)
     MATH2_BASIC(fn, uchar)                          \
     MATH2_BASIC(fn, uintl)                          \
     MATH2_BASIC(fn, intl)                           \
+    MATH2_BASIC(fn, ushort)                         \
+    MATH2_BASIC(fn, short)                          \
     __device__ double ___##fn(double a, double b)   \
     {                                               \
         return fn(a, b);                            \
diff --git a/src/backend/cuda/JIT/hyper.cu b/src/backend/cuda/JIT/hyper.cu
index 00ea2da33c..6673fb1f14 100644
--- a/src/backend/cuda/JIT/hyper.cu
+++ b/src/backend/cuda/JIT/hyper.cu
@@ -24,6 +24,8 @@
     MATH_BASIC(fn, uchar)                       \
     MATH_BASIC(fn, uintl)                       \
     MATH_BASIC(fn, intl)                        \
+    MATH_BASIC(fn, ushort)                      \
+    MATH_BASIC(fn, short)                       \
     __device__ double ___##fn(double a)         \
     {                                           \
         return fn(a);                           \
diff --git a/src/backend/cuda/JIT/logic.cu b/src/backend/cuda/JIT/logic.cu
index 883f3dbc5b..6072c3c447 100644
--- a/src/backend/cuda/JIT/logic.cu
+++ b/src/backend/cuda/JIT/logic.cu
@@ -25,6 +25,8 @@
     LOGIC_BASIC(fn, op, uchar)                      \
     LOGIC_BASIC(fn, op, intl)                       \
     LOGIC_BASIC(fn, op, uintl)                      \
+    LOGIC_BASIC(fn, op, short)                      \
+    LOGIC_BASIC(fn, op, ushort)                     \
                                                     \
     __device__ bool ___##fn(cfloat a, cfloat b)     \
     {                                               \
@@ -52,6 +54,8 @@ LOGIC(or, ||)
     LOGIC_BASIC(fn, op, uchar)                      \
     LOGIC_BASIC(fn, op, intl)                       \
     LOGIC_BASIC(fn, op, uintl)                      \
+    LOGIC_BASIC(fn, op, short)                      \
+    LOGIC_BASIC(fn, op, ushort)                     \
                                                     \
     __device__ bool ___##fn(cfloat a, cfloat b)     \
     {                                               \
@@ -77,6 +81,8 @@ NOT_FN(char)
 NOT_FN(uchar)
 NOT_FN(intl)
 NOT_FN(uintl)
+NOT_FN(short)
+NOT_FN(ushort)
 
 #define BIT_FN(T)                                                   \
     __device__ T ___bitand   (T lhs, T rhs) { return lhs &  rhs; }  \
@@ -91,6 +97,8 @@ BIT_FN(intl)
 BIT_FN(uchar)
 BIT_FN(uint)
 BIT_FN(uintl)
+BIT_FN(short)
+BIT_FN(ushort)
 
 __device__ char ___isNaN(float in) { return isnan(in); }
 __device__ char ___isINF(float in) { return isinf(in); }
diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu
index 158cc243cd..8253db6d22 100644
--- a/src/backend/cuda/JIT/numeric.cu
+++ b/src/backend/cuda/JIT/numeric.cu
@@ -39,6 +39,8 @@ MATH_NOOP(floor, char)
 MATH_NOOP(floor, uchar)
 MATH_NOOP(floor, uintl)
 MATH_NOOP(floor, intl)
+MATH_NOOP(floor, ushort)
+MATH_NOOP(floor, short)
 
 MATH_BASIC(ceil, float)
 MATH_BASIC(ceil, double)
@@ -48,6 +50,8 @@ MATH_NOOP(ceil, char)
 MATH_NOOP(ceil, uchar)
 MATH_NOOP(ceil, uintl)
 MATH_NOOP(ceil, intl)
+MATH_NOOP(ceil, ushort)
+MATH_NOOP(ceil, short)
 
 MATH_BASIC(round, float)
 MATH_BASIC(round, double)
@@ -57,6 +61,8 @@ MATH_NOOP(round, char)
 MATH_NOOP(round, uchar)
 MATH_NOOP(round, uintl)
 MATH_NOOP(round, intl)
+MATH_NOOP(round, ushort)
+MATH_NOOP(round, short)
 
 MATH_BASIC(trunc, float)
 MATH_BASIC(trunc, double)
@@ -66,6 +72,8 @@ MATH_NOOP(trunc, char)
 MATH_NOOP(trunc, uchar)
 MATH_NOOP(trunc, uintl)
 MATH_NOOP(trunc, intl)
+MATH_NOOP(trunc, ushort)
+MATH_NOOP(trunc, short)
 
 MATH_BASIC(sign, float)
 MATH_BASIC(sign, double)
@@ -75,6 +83,8 @@ MATH_NOOP(sign, char)
 MATH_NOOP(sign, uchar)
 MATH_NOOP(sign, uintl)
 MATH_NOOP(sign, intl)
+MATH_NOOP(sign, ushort)
+MATH_NOOP(sign, short)
 
 MATH_BASIC(abs, float)
 MATH_BASIC(abs, double)
@@ -84,24 +94,30 @@ MATH_NOOP(abs, uint)
 MATH_NOOP(abs, uchar)
 MATH_NOOP(abs, uintl)
 MATH_NOOP(abs, intl)
+MATH_NOOP(abs, ushort)
+MATH_NOOP(abs, short)
 
 MATH_BASIC(tgamma, float)
 MATH_BASIC(tgamma, double)
-MATH_CAST(tgamma, int, float)
-MATH_CAST(tgamma, uint, float)
-MATH_CAST(tgamma, char, float)
-MATH_CAST(tgamma, uchar, float)
-MATH_CAST(tgamma, uintl, float)
-MATH_CAST(tgamma, intl, float)
+MATH_CAST(tgamma, int   , float)
+MATH_CAST(tgamma, uint  , float)
+MATH_CAST(tgamma, char  , float)
+MATH_CAST(tgamma, uchar , float)
+MATH_CAST(tgamma, uintl , float)
+MATH_CAST(tgamma, intl  , float)
+MATH_CAST(tgamma, ushort, float)
+MATH_CAST(tgamma, short , float)
 
 MATH_BASIC(lgamma, float)
 MATH_BASIC(lgamma, double)
-MATH_CAST(lgamma, int, float)
-MATH_CAST(lgamma, uint, float)
-MATH_CAST(lgamma, char, float)
-MATH_CAST(lgamma, uchar, float)
-MATH_CAST(lgamma, uintl, float)
-MATH_CAST(lgamma, intl, float)
+MATH_CAST(lgamma, int   , float)
+MATH_CAST(lgamma, uint  , float)
+MATH_CAST(lgamma, char  , float)
+MATH_CAST(lgamma, uchar , float)
+MATH_CAST(lgamma, uintl , float)
+MATH_CAST(lgamma, intl  , float)
+MATH_CAST(lgamma, ushort, float)
+MATH_CAST(lgamma, short , float)
 
 __device__ float ___abs(cfloat a) { return cuCabsf(a); }
 __device__ double ___abs(cdouble a) { return cuCabs(a); }
@@ -128,6 +144,8 @@ __device__ double mod(double a, double b) { return fmod(a, b); }
     MATH2_BASIC(fn, uintl)                          \
     MATH2_BASIC(fn, char)                           \
     MATH2_BASIC(fn, uchar)                          \
+    MATH2_BASIC(fn, short)                          \
+    MATH2_BASIC(fn, ushort)                         \
     __device__ double ___##fn(double a, double b)   \
     {                                               \
         return fn(a, b);                            \
diff --git a/src/backend/cuda/JIT/trig.cu b/src/backend/cuda/JIT/trig.cu
index 28f098ed8e..372bd4d026 100644
--- a/src/backend/cuda/JIT/trig.cu
+++ b/src/backend/cuda/JIT/trig.cu
@@ -24,6 +24,8 @@
     MATH_BASIC(fn, uchar)                       \
     MATH_BASIC(fn, uintl)                       \
     MATH_BASIC(fn, intl)                        \
+    MATH_BASIC(fn, ushort)                      \
+    MATH_BASIC(fn, short)                       \
     __device__ double ___##fn(double a)         \
     {                                           \
         return fn(a);                           \
@@ -51,6 +53,8 @@ ATAN2(char)
 ATAN2(uchar)
 ATAN2(uintl)
 ATAN2(intl)
+ATAN2(ushort)
+ATAN2(short)
 
 __device__ double ___atan2(double x, double y)
 {
diff --git a/src/backend/cuda/JIT/types.h b/src/backend/cuda/JIT/types.h
index 80314bc34d..4a97ef3842 100644
--- a/src/backend/cuda/JIT/types.h
+++ b/src/backend/cuda/JIT/types.h
@@ -11,6 +11,7 @@
 #include <math_functions.h>
 typedef unsigned char uchar;
 typedef unsigned int uint;
+typedef unsigned short ushort;
 typedef cuFloatComplex cfloat;
 typedef cuDoubleComplex cdouble;
 typedef long long intl;
diff --git a/src/backend/cuda/all.cu b/src/backend/cuda/all.cu
index bfc070a7b6..b70f98ab28 100644
--- a/src/backend/cuda/all.cu
+++ b/src/backend/cuda/all.cu
@@ -22,4 +22,6 @@ namespace cuda
     INSTANTIATE(af_and_t, uintl  , char)
     INSTANTIATE(af_and_t, char   , char)
     INSTANTIATE(af_and_t, uchar  , char)
+    INSTANTIATE(af_and_t, short  , char)
+    INSTANTIATE(af_and_t, ushort , char)
 }
diff --git a/src/backend/cuda/any.cu b/src/backend/cuda/any.cu
index 836970e61a..aa13fbb67b 100644
--- a/src/backend/cuda/any.cu
+++ b/src/backend/cuda/any.cu
@@ -22,4 +22,6 @@ namespace cuda
     INSTANTIATE(af_or_t, uintl  , char)
     INSTANTIATE(af_or_t, char   , char)
     INSTANTIATE(af_or_t, uchar  , char)
+    INSTANTIATE(af_or_t, short  , char)
+    INSTANTIATE(af_or_t, ushort , char)
 }
diff --git a/src/backend/cuda/assign.cu b/src/backend/cuda/assign.cu
index 7bea851fdd..7d00b15c5f 100644
--- a/src/backend/cuda/assign.cu
+++ b/src/backend/cuda/assign.cu
@@ -69,11 +69,13 @@ INSTANTIATE(cdouble)
 INSTANTIATE(double )
 INSTANTIATE(cfloat )
 INSTANTIATE(float  )
-INSTANTIATE(uintl  )
+INSTANTIATE(int    )
 INSTANTIATE(uint   )
 INSTANTIATE(intl   )
-INSTANTIATE(int    )
-INSTANTIATE(uchar  )
+INSTANTIATE(uintl  )
 INSTANTIATE(char   )
+INSTANTIATE(uchar  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/cuda/bilateral.cu b/src/backend/cuda/bilateral.cu
index 4c1d7fc6f9..bdb19fdef5 100644
--- a/src/backend/cuda/bilateral.cu
+++ b/src/backend/cuda/bilateral.cu
@@ -37,5 +37,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/cuda/cholesky.cu b/src/backend/cuda/cholesky.cu
index d785eef3ef..c6869dc6a6 100644
--- a/src/backend/cuda/cholesky.cu
+++ b/src/backend/cuda/cholesky.cu
@@ -148,6 +148,34 @@ INSTANTIATE_CH(double)
 INSTANTIATE_CH(cdouble)
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+#include <cpu_lapack/cpu_cholesky.hpp>
+namespace cuda
+{
+
+template<typename T>
+Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
+{
+    return cpu::cholesky(info, in, is_upper);
+}
+
+template<typename T>
+int cholesky_inplace(Array<T> &in, const bool is_upper)
+{
+    return cpu::cholesky_inplace(in, is_upper);
+}
+
+#define INSTANTIATE_CH(T)                                                                   \
+    template int cholesky_inplace<T>(Array<T> &in, const bool is_upper);                    \
+    template Array<T> cholesky<T>   (int *info, const Array<T> &in, const bool is_upper);
+
+INSTANTIATE_CH(float)
+INSTANTIATE_CH(cfloat)
+INSTANTIATE_CH(double)
+INSTANTIATE_CH(cdouble)
+
+}
+
 #else
 namespace cuda
 {
diff --git a/src/backend/cuda/convolve.cpp b/src/backend/cuda/convolve.cpp
index 9f14e6a3bd..5f2e57c07b 100644
--- a/src/backend/cuda/convolve.cpp
+++ b/src/backend/cuda/convolve.cpp
@@ -96,5 +96,9 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu
index 1f23804cbb..90f9970239 100644
--- a/src/backend/cuda/copy.cu
+++ b/src/backend/cuda/copy.cu
@@ -120,16 +120,18 @@ namespace cuda
     template Array<T> copyArray<T>(const Array<T> &A);              \
     template void      multiply_inplace<T> (Array<T> &in, double norm); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
+    INSTANTIATE(float  )
+    INSTANTIATE(double )
+    INSTANTIATE(cfloat )
     INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
+    INSTANTIATE(int    )
+    INSTANTIATE(uint   )
+    INSTANTIATE(uchar  )
+    INSTANTIATE(char   )
     INSTANTIATE(intl   )
     INSTANTIATE(uintl  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 
 #define INSTANTIATE_PAD_ARRAY(SRC_T)                                    \
     template Array<float  > padArray<SRC_T, float  >(Array<SRC_T> const &src, dim4 const &dims, float   default_value, double factor); \
@@ -138,8 +140,10 @@ namespace cuda
     template Array<cdouble> padArray<SRC_T, cdouble>(Array<SRC_T> const &src, dim4 const &dims, cdouble default_value, double factor); \
     template Array<int    > padArray<SRC_T, int    >(Array<SRC_T> const &src, dim4 const &dims, int     default_value, double factor); \
     template Array<uint   > padArray<SRC_T, uint   >(Array<SRC_T> const &src, dim4 const &dims, uint    default_value, double factor); \
-    template Array<intl    > padArray<SRC_T, intl    >(Array<SRC_T> const &src, dim4 const &dims, intl     default_value, double factor); \
-    template Array<uintl   > padArray<SRC_T, uintl   >(Array<SRC_T> const &src, dim4 const &dims, uintl    default_value, double factor); \
+    template Array<intl   > padArray<SRC_T, intl   >(Array<SRC_T> const &src, dim4 const &dims, intl    default_value, double factor); \
+    template Array<uintl  > padArray<SRC_T, uintl  >(Array<SRC_T> const &src, dim4 const &dims, uintl   default_value, double factor); \
+    template Array<short  > padArray<SRC_T, short  >(Array<SRC_T> const &src, dim4 const &dims, short   default_value, double factor); \
+    template Array<ushort > padArray<SRC_T, ushort >(Array<SRC_T> const &src, dim4 const &dims, ushort  default_value, double factor); \
     template Array<uchar  > padArray<SRC_T, uchar  >(Array<SRC_T> const &src, dim4 const &dims, uchar   default_value, double factor); \
     template Array<char   > padArray<SRC_T, char   >(Array<SRC_T> const &src, dim4 const &dims, char    default_value, double factor); \
     template void copyArray<SRC_T, float  >(Array<float  > &dst, Array<SRC_T> const &src); \
@@ -148,8 +152,10 @@ namespace cuda
     template void copyArray<SRC_T, cdouble>(Array<cdouble> &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, int    >(Array<int    > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, uint   >(Array<uint   > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, intl    >(Array<intl    > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, uintl   >(Array<uintl   > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, intl   >(Array<intl   > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, uintl  >(Array<uintl  > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, short  >(Array<short  > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, ushort >(Array<ushort > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, char   >(Array<char   > &dst, Array<SRC_T> const &src);
 
@@ -157,8 +163,10 @@ namespace cuda
     INSTANTIATE_PAD_ARRAY(double)
     INSTANTIATE_PAD_ARRAY(int   )
     INSTANTIATE_PAD_ARRAY(uint  )
-    INSTANTIATE_PAD_ARRAY(intl   )
-    INSTANTIATE_PAD_ARRAY(uintl  )
+    INSTANTIATE_PAD_ARRAY(intl  )
+    INSTANTIATE_PAD_ARRAY(uintl )
+    INSTANTIATE_PAD_ARRAY(short )
+    INSTANTIATE_PAD_ARRAY(ushort)
     INSTANTIATE_PAD_ARRAY(uchar )
     INSTANTIATE_PAD_ARRAY(char  )
 
@@ -185,6 +193,8 @@ namespace cuda
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, int)
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, intl)
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat, short)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat, ushort)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
@@ -193,4 +203,6 @@ namespace cuda
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
 }
diff --git a/src/backend/cuda/count.cu b/src/backend/cuda/count.cu
index d6241414bb..365897f75d 100644
--- a/src/backend/cuda/count.cu
+++ b/src/backend/cuda/count.cu
@@ -20,6 +20,8 @@ namespace cuda
     INSTANTIATE(af_notzero_t, uint   , uint)
     INSTANTIATE(af_notzero_t, intl   , uint)
     INSTANTIATE(af_notzero_t, uintl  , uint)
+    INSTANTIATE(af_notzero_t, short  , uint)
+    INSTANTIATE(af_notzero_t, ushort , uint)
     INSTANTIATE(af_notzero_t, char   , uint)
     INSTANTIATE(af_notzero_t, uchar  , uint)
 }
diff --git a/src/backend/cuda/cpu_lapack/cpu_cholesky.cpp b/src/backend/cuda/cpu_lapack/cpu_cholesky.cpp
new file mode 100644
index 0000000000..29826dcecb
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_cholesky.cpp
@@ -0,0 +1,109 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <cpu_lapack/cpu_cholesky.hpp>
+#include <err_common.hpp>
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <copy.hpp>
+#include <iostream>
+#include <cassert>
+
+#include <cpu_lapack/cpu_triangle.hpp>
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using potrf_func_def = int (*)(ORDER_TYPE, char,
+                               int,
+                               T*, int);
+
+#define CH_FUNC_DEF( FUNC )                                     \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define CH_FUNC( FUNC, TYPE, PREFIX )                           \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()        \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+CH_FUNC_DEF( potrf )
+CH_FUNC(potrf , float  , s)
+CH_FUNC(potrf , double , d)
+CH_FUNC(potrf , cfloat , c)
+CH_FUNC(potrf , cdouble, z)
+
+template<typename T>
+Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
+{
+    dim4 iDims = in.dims();
+    int N = iDims[0];
+
+    char uplo = 'L';
+    if(is_upper)
+        uplo = 'U';
+
+    T *inPtr = pinnedAlloc<T>(in.elements());
+    copyData(inPtr, in);
+
+    *info = potrf_func<T>()(AF_LAPACK_COL_MAJOR, uplo,
+                            N, inPtr, in.strides()[1]);
+
+    if (is_upper) triangle<T, true , false>(inPtr, inPtr, in.dims(), in.strides(), in.strides());
+    else          triangle<T, false, false>(inPtr, inPtr, in.dims(), in.strides(), in.strides());
+
+    Array<T> out = createHostDataArray<T>(in.dims(), inPtr);
+
+    pinnedFree(inPtr);
+
+    return out;
+}
+
+template<typename T>
+int cholesky_inplace(Array<T> &in, const bool is_upper)
+{
+    dim4 iDims = in.dims();
+    int N = iDims[0];
+
+    char uplo = 'L';
+    if(is_upper)
+        uplo = 'U';
+
+    T *inPtr = pinnedAlloc<T>(in.elements());
+    copyData(inPtr, in);
+
+    int info = potrf_func<T>()(AF_LAPACK_COL_MAJOR, uplo,
+                               N, inPtr, in.strides()[1]);
+
+    writeHostDataArray<T>(in, inPtr, in.elements() * sizeof(T));
+
+    pinnedFree(inPtr);
+
+    return info;
+}
+
+#define INSTANTIATE_CH(T)                                                                   \
+    template int cholesky_inplace<T>(Array<T> &in, const bool is_upper);                    \
+    template Array<T> cholesky<T>   (int *info, const Array<T> &in, const bool is_upper);   \
+
+
+INSTANTIATE_CH(float)
+INSTANTIATE_CH(cfloat)
+INSTANTIATE_CH(double)
+INSTANTIATE_CH(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_cholesky.hpp b/src/backend/cuda/cpu_lapack/cpu_cholesky.hpp
new file mode 100644
index 0000000000..03f9fa80d8
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_cholesky.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>
+    Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper);
+
+    template<typename T>
+    int cholesky_inplace(Array<T> &in, const bool is_upper);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_inverse.cpp b/src/backend/cuda/cpu_lapack/cpu_inverse.cpp
new file mode 100644
index 0000000000..a0ddf39335
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_inverse.cpp
@@ -0,0 +1,92 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <inverse.hpp>
+#include <err_common.hpp>
+
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <identity.hpp>
+#include <copy.hpp>
+#include <iostream>
+#include <cassert>
+
+#include "lapack_helper.hpp"
+#include <cpu_lapack/cpu_lu.hpp>
+#include <cpu_lapack/cpu_solve.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using getri_func_def = int (*)(ORDER_TYPE, int,
+                               T *, int,
+                               const int *);
+
+#define INV_FUNC_DEF( FUNC )                                        \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+#define INV_FUNC( FUNC, TYPE, PREFIX )                              \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+INV_FUNC_DEF( getri )
+INV_FUNC(getri , float  , s)
+INV_FUNC(getri , double , d)
+INV_FUNC(getri , cfloat , c)
+INV_FUNC(getri , cdouble, z)
+
+template<typename T>
+Array<T> inverse(const Array<T> &in)
+{
+    int M = in.dims()[0];
+    int N = in.dims()[1];
+
+    if (M != N) {
+        Array<T> I = identity<T>(in.dims());
+        return cpu::solve(in, I);
+    }
+
+    Array<T> A = copyArray<T>(in);
+
+    Array<int> pivot = lu_inplace<T>(A, false);
+
+    T *aPtr = pinnedAlloc<T>(A.elements());
+    int *pPtr = pinnedAlloc<int>(pivot.elements());
+    copyData(aPtr, A);
+    copyData(pPtr, pivot);
+
+    getri_func<T>()(AF_LAPACK_COL_MAJOR, M,
+                    aPtr, A.strides()[1],
+                    pPtr);
+
+    writeHostDataArray<T>(A, aPtr, A.elements() * sizeof(T));
+
+    pinnedFree(aPtr);
+    pinnedFree(pPtr);
+
+    return A;
+}
+
+#define INSTANTIATE(T)                                                                   \
+    template Array<T> inverse<T> (const Array<T> &in);
+
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_inverse.hpp b/src/backend/cuda/cpu_lapack/cpu_inverse.hpp
new file mode 100644
index 0000000000..f45fdee990
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_inverse.hpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>
+    Array<T> inverse(const Array<T> &in);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.cpp b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
new file mode 100644
index 0000000000..ea8313206a
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
@@ -0,0 +1,197 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <cpu_lapack/cpu_lu.hpp>
+#include <err_common.hpp>
+
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <iostream>
+#include <cassert>
+#include <err_cuda.hpp>
+
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using getrf_func_def = int (*)(ORDER_TYPE, int, int,
+                               T*, int,
+                               int*);
+
+#define LU_FUNC_DEF( FUNC )                                     \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define LU_FUNC( FUNC, TYPE, PREFIX )                           \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()        \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+LU_FUNC_DEF( getrf )
+LU_FUNC(getrf , float  , s)
+LU_FUNC(getrf , double , d)
+LU_FUNC(getrf , cfloat , c)
+LU_FUNC(getrf , cdouble, z)
+
+template<typename T>
+void lu_split(T *l, T *u, const T *i,
+        const dim4 ldm, const dim4 udm, const dim4 idm,
+        const dim4 lst, const dim4 ust, const dim4 ist)
+{
+    for(dim_t ow = 0; ow < idm[3]; ow++) {
+        const dim_t lW = ow * lst[3];
+        const dim_t uW = ow * ust[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < idm[2]; oz++) {
+            const dim_t lZW = lW + oz * lst[2];
+            const dim_t uZW = uW + oz * ust[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < idm[1]; oy++) {
+                const dim_t lYZW = lZW + oy * lst[1];
+                const dim_t uYZW = uZW + oy * ust[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+
+                for(dim_t ox = 0; ox < idm[0]; ox++) {
+                    const dim_t lMem = lYZW + ox;
+                    const dim_t uMem = uYZW + ox;
+                    const dim_t iMem = iYZW + ox;
+                    if(ox > oy) {
+                        if(oy < ldm[1])
+                            l[lMem] = i[iMem];
+                        if(ox < udm[0])
+                            u[uMem] = scalar<T>(0);
+                    } else if (oy > ox) {
+                        if(oy < ldm[1])
+                            l[lMem] = scalar<T>(0);
+                        if(ox < udm[0])
+                            u[uMem] = i[iMem];
+                    } else if(ox == oy) {
+                        if(oy < ldm[1])
+                            l[lMem] = scalar<T>(1.0);
+                        if(ox < udm[0])
+                            u[uMem] = i[iMem];
+                    }
+                }
+            }
+        }
+    }
+}
+
+void convertPivot(int **pivot, int out_sz, dim_t d0)
+{
+    int* p = pinnedAlloc<int>(out_sz);
+    for(int i = 0; i < out_sz; i++)
+        p[i] = i;
+
+    for(int j = 0; j < (int)d0; j++) {
+        // 1 indexed in pivot
+        std::swap(p[j], p[(*pivot)[j] - 1]);
+    }
+
+    pinnedFree(*pivot);
+    *pivot = p;
+}
+
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+
+    Array<T> in_copy = copyArray<T>(in);
+
+    //////////////////////////////////////////
+    // LU inplace
+    int *pivotPtr  = pinnedAlloc<int>(min(M, N));
+    T   *inPtr     = pinnedAlloc<T>  (in_copy.elements());
+    copyData(inPtr, in);
+
+    getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                    inPtr, in_copy.strides()[1],
+                    pivotPtr);
+
+    convertPivot(&pivotPtr, M, min(M, N));
+
+    pivot = createHostDataArray<int>(af::dim4(M), pivotPtr);
+    //////////////////////////////////////////
+
+    // SPLIT into lower and upper
+    dim4 ldims(M, min(M, N));
+    dim4 udims(min(M, N), N);
+
+    T *lowerPtr = pinnedAlloc<T>(ldims.elements());
+    T *upperPtr = pinnedAlloc<T>(udims.elements());
+
+    dim4 lst(1, ldims[0], ldims[0] * ldims[1], ldims[0] * ldims[1] * ldims[2]);
+    dim4 ust(1, udims[0], udims[0] * udims[1], udims[0] * udims[1] * udims[2]);
+
+    lu_split<T>(lowerPtr, upperPtr, inPtr, ldims, udims, iDims,
+                lst, ust, in_copy.strides());
+
+    lower = createHostDataArray<T>(ldims, lowerPtr);
+    upper = createHostDataArray<T>(udims, upperPtr);
+
+    lower.eval();
+    upper.eval();
+
+    pinnedFree(lowerPtr);
+    pinnedFree(upperPtr);
+    pinnedFree(pivotPtr);
+    pinnedFree(inPtr);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+
+    int *pivotPtr  = pinnedAlloc<int>(min(M, N));
+    T   *inPtr     = pinnedAlloc<T>  (in.elements());
+    copyData(inPtr, in);
+
+    getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                    inPtr, in.strides()[1],
+                    pivotPtr);
+
+    if(convert_pivot) convertPivot(&pivotPtr, M, min(M, N));
+
+    writeHostDataArray<T>(in, inPtr, in.elements() * sizeof(T));
+    Array<int> pivot = createHostDataArray<int>(af::dim4(M), pivotPtr);
+
+    pivot.eval();
+
+    pinnedFree(inPtr);
+    pinnedFree(pivotPtr);
+
+    return pivot;
+}
+
+#define INSTANTIATE_LU(T)                                                                           \
+    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
+    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.hpp b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
new file mode 100644
index 0000000000..39a638fbce
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>
+    void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+    template<typename T>
+    Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_qr.cpp b/src/backend/cuda/cpu_lapack/cpu_qr.cpp
new file mode 100644
index 0000000000..853119ff16
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_qr.cpp
@@ -0,0 +1,160 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <cpu_lapack/cpu_qr.hpp>
+#include <err_common.hpp>
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <copy.hpp>
+#include <iostream>
+#include <cassert>
+
+#include <cpu_lapack/cpu_triangle.hpp>
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using geqrf_func_def = int (*)(ORDER_TYPE, int, int,
+                               T*, int,
+                               T*);
+
+template<typename T>
+using gqr_func_def = int (*)(ORDER_TYPE, int, int, int,
+                             T*, int,
+                             const T*);
+
+#define QR_FUNC_DEF( FUNC )                                         \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define QR_FUNC( FUNC, TYPE, PREFIX )                               \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+QR_FUNC_DEF( geqrf )
+QR_FUNC(geqrf , float  , s)
+QR_FUNC(geqrf , double , d)
+QR_FUNC(geqrf , cfloat , c)
+QR_FUNC(geqrf , cdouble, z)
+
+#define GQR_FUNC_DEF( FUNC )                                         \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+#define GQR_FUNC( FUNC, TYPE, PREFIX )                               \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()             \
+{ return & LAPACK_NAME(PREFIX); }
+
+GQR_FUNC_DEF( gqr )
+GQR_FUNC(gqr , float  , sorgqr)
+GQR_FUNC(gqr , double , dorgqr)
+GQR_FUNC(gqr , cfloat , cungqr)
+GQR_FUNC(gqr , cdouble, zungqr)
+
+template<typename T>
+void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+
+    dim4 padDims(M, max(M, N));
+    q = padArray<T, T>(in, padDims, scalar<T>(0));
+    q.resetDims(iDims);
+
+    dim4 qdims = q.dims();
+
+    T *tPtr = NULL;
+    T *qPtr = NULL;
+    int nT = 0;
+    {
+        ///////////////////////////////////////////////
+        // QR Inplace on q
+        int M_ = qdims[0];
+        int N_ = qdims[1];
+        nT = min(M_, N_);
+
+        tPtr = pinnedAlloc<T>(nT);
+        qPtr = pinnedAlloc<T>(padDims.elements());
+        q.resetDims(padDims);
+        copyData(qPtr, q);
+        q.resetDims(iDims);
+
+        geqrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                        qPtr, M,
+                        tPtr);
+        ///////////////////////////////////////////////
+    }
+
+    // SPLIT into q and r
+    dim4 rdims(M, N);
+    T *rPtr = pinnedAlloc<T>(rdims.elements());
+
+    dim4 rst(1, rdims[0], rdims[0] * rdims[1], rdims[0] * rdims[1] * rdims[2]);
+
+    triangle<T, true, false>(rPtr, qPtr, rdims, rst, q.strides());
+
+    gqr_func<T>()(AF_LAPACK_COL_MAJOR,
+                  M, M, min(M, N),
+                  qPtr, q.strides()[1],
+                  tPtr);
+
+    q.resetDims(dim4(M, M));
+
+    t = createHostDataArray<T>(af::dim4(nT), tPtr);
+    r = createHostDataArray<T>(rdims, rPtr);
+    writeHostDataArray<T>(q, qPtr, q.elements() * sizeof(T));
+
+    pinnedFree(tPtr);
+    pinnedFree(rPtr);
+    pinnedFree(qPtr);
+}
+
+template<typename T>
+Array<T> qr_inplace(Array<T> &in)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+
+    T *tPtr  = pinnedAlloc<T>(min(M, N));
+    T *inPtr = pinnedAlloc<T>(in.elements());
+    copyData(inPtr, in);
+
+    geqrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                    inPtr, in.strides()[1],
+                    tPtr);
+
+    writeHostDataArray<T>(in, inPtr, in.elements() * sizeof(T));
+    Array<T> t = createHostDataArray<T>(af::dim4(min(M, N)), tPtr);
+
+    pinnedFree(inPtr);
+    pinnedFree(tPtr);
+
+    return t;
+}
+
+#define INSTANTIATE_QR(T)                                                                           \
+    template Array<T> qr_inplace<T>(Array<T> &in);                                                \
+    template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
+
+INSTANTIATE_QR(float)
+INSTANTIATE_QR(cfloat)
+INSTANTIATE_QR(double)
+INSTANTIATE_QR(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_qr.hpp b/src/backend/cuda/cpu_lapack/cpu_qr.hpp
new file mode 100644
index 0000000000..a7a628466d
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_qr.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>
+    void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
+
+    template<typename T>
+    Array<T> qr_inplace(Array<T> &in);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_solve.cpp b/src/backend/cuda/cpu_lapack/cpu_solve.cpp
new file mode 100644
index 0000000000..c9d080321b
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_solve.cpp
@@ -0,0 +1,206 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <cpu_lapack/cpu_solve.hpp>
+#include <err_common.hpp>
+
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <iostream>
+#include <cassert>
+#include <err_cuda.hpp>
+
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using gesv_func_def = int (*)(ORDER_TYPE, int, int,
+                              T *, int,
+                              int *,
+                              T *, int);
+
+template<typename T>
+using gels_func_def = int (*)(ORDER_TYPE, char,
+                              int, int, int,
+                              T *, int,
+                              T *, int);
+
+template<typename T>
+using getrs_func_def = int (*)(ORDER_TYPE, char,
+                               int, int,
+                               const T *, int,
+                               const int *,
+                               T *, int);
+
+template<typename T>
+using trtrs_func_def = int (*)(ORDER_TYPE,
+                               char, char, char,
+                               int, int,
+                               const T *, int,
+                               T *, int);
+
+
+#define SOLVE_FUNC_DEF( FUNC )                                      \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define SOLVE_FUNC( FUNC, TYPE, PREFIX )                            \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()            \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+SOLVE_FUNC_DEF( gesv )
+SOLVE_FUNC(gesv , float  , s)
+SOLVE_FUNC(gesv , double , d)
+SOLVE_FUNC(gesv , cfloat , c)
+SOLVE_FUNC(gesv , cdouble, z)
+
+SOLVE_FUNC_DEF( gels )
+SOLVE_FUNC(gels , float  , s)
+SOLVE_FUNC(gels , double , d)
+SOLVE_FUNC(gels , cfloat , c)
+SOLVE_FUNC(gels , cdouble, z)
+
+SOLVE_FUNC_DEF( getrs )
+SOLVE_FUNC(getrs , float  , s)
+SOLVE_FUNC(getrs , double , d)
+SOLVE_FUNC(getrs , cfloat , c)
+SOLVE_FUNC(getrs , cdouble, z)
+
+SOLVE_FUNC_DEF( trtrs )
+SOLVE_FUNC(trtrs , float  , s)
+SOLVE_FUNC(trtrs , double , d)
+SOLVE_FUNC(trtrs , cfloat , c)
+SOLVE_FUNC(trtrs , cdouble, z)
+
+template<typename T>
+Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
+                 const Array<T> &b, const af_mat_prop options)
+{
+    int N = A.dims()[0];
+    int NRHS = b.dims()[1];
+
+    T *aPtr = pinnedAlloc<T>(A.elements());
+    T *bPtr = pinnedAlloc<T>(b.elements());
+    int *pPtr = pinnedAlloc<int>(pivot.elements());
+
+    copyData(aPtr, A);
+    copyData(bPtr, b);
+    copyData(pPtr, pivot);
+
+    getrs_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                    N, NRHS,
+                    aPtr, A.strides()[1],
+                    pPtr,
+                    bPtr, b.strides()[1]);
+
+    Array<T> B = createHostDataArray<T>(b.dims(), bPtr);
+
+    pinnedFree(aPtr);
+    pinnedFree(bPtr);
+    pinnedFree(pPtr);
+
+    return B;
+}
+
+template<typename T>
+Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
+{
+    int N = b.dims()[0];
+    int NRHS = b.dims()[1];
+
+    T *aPtr = pinnedAlloc<T>(A.elements());
+    T *bPtr = pinnedAlloc<T>(b.elements());
+    copyData(aPtr, A);
+    copyData(bPtr, b);
+
+    trtrs_func<T>()(AF_LAPACK_COL_MAJOR,
+                    options & AF_MAT_UPPER ? 'U' : 'L',
+                    'N', // transpose flag
+                    options & AF_MAT_DIAG_UNIT ? 'U' : 'N',
+                    N, NRHS,
+                    aPtr, A.strides()[1],
+                    bPtr, b.strides()[1]);
+
+    Array<T> B = createHostDataArray<T>(b.dims(), bPtr);
+
+    pinnedFree(aPtr);
+    pinnedFree(bPtr);
+
+    return B;
+}
+
+
+template<typename T>
+Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
+{
+
+    if (options & AF_MAT_UPPER ||
+        options & AF_MAT_LOWER) {
+        return triangleSolve<T>(a, b, options);
+    }
+
+    int M = a.dims()[0];
+    int N = a.dims()[1];
+    int K = b.dims()[1];
+
+    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K), scalar<T>(0));
+
+    T *aPtr = pinnedAlloc<T>(a.elements());
+    T *bPtr = pinnedAlloc<T>(B.elements());
+    copyData(aPtr, a);
+    copyData(bPtr, B);
+
+    if(M == N) {
+        int *pivotPtr  = pinnedAlloc<int>(N);
+        gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
+                       aPtr, a.strides()[1],
+                       pivotPtr,
+                       bPtr, B.strides()[1]);
+        pinnedFree(pivotPtr);
+
+        writeHostDataArray<T>(B, bPtr, B.elements() * sizeof(T));
+    } else {
+        int sM = a.strides()[1];
+        int sN = a.strides()[2] / sM;
+
+        gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
+                       M, N, K,
+                       aPtr, a.strides()[1],
+                       bPtr, max(sM, sN));
+        writeHostDataArray<T>(B, bPtr, B.elements() * sizeof(T));
+        B.resetDims(dim4(N, K));
+    }
+
+    pinnedFree(aPtr);
+    pinnedFree(bPtr);
+
+    return B;
+}
+
+#define INSTANTIATE_SOLVE(T)                                            \
+    template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
+                               const af_mat_prop options);              \
+    template Array<T> solveLU<T>(const Array<T> &A, const Array<int> &pivot, \
+                                 const Array<T> &b, const af_mat_prop options); \
+
+INSTANTIATE_SOLVE(float)
+INSTANTIATE_SOLVE(cfloat)
+INSTANTIATE_SOLVE(double)
+INSTANTIATE_SOLVE(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_solve.hpp b/src/backend/cuda/cpu_lapack/cpu_solve.hpp
new file mode 100644
index 0000000000..64a1ef3d44
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_solve.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>
+    Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options = AF_MAT_NONE);
+
+    template<typename T>
+    Array<T> solveLU(const Array<T> &a, const Array<int> &pivot,
+                     const Array<T> &b, const af_mat_prop options = AF_MAT_NONE);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_svd.cpp b/src/backend/cuda/cpu_lapack/cpu_svd.cpp
new file mode 100644
index 0000000000..eb71606ee4
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_svd.cpp
@@ -0,0 +1,153 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+#include <cpu_lapack/cpu_svd.hpp>
+
+#include <Array.hpp>
+#include <svd.hpp>
+#include <err_common.hpp>
+#include <copy.hpp>
+
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+#define SVD_FUNC_DEF( FUNC )                                            \
+    template<typename T,typename Tr> svd_func_def<T, Tr> svd_func();
+
+#define SVD_FUNC( FUNC, T, Tr, PREFIX )                     \
+    template<> svd_func_def<T, Tr>     svd_func<T, Tr>()    \
+    { return & LAPACK_NAME(PREFIX##FUNC); }
+
+#if defined(USE_MKL) || defined(__APPLE__)
+
+    template<typename T, typename Tr>
+    using svd_func_def = int (*)(ORDER_TYPE,
+                                 char jobz,
+                                 int m, int n,
+                                 T* in, int ldin,
+                                 Tr* s,
+                                 T* u, int ldu,
+                                 T* vt, int ldvt);
+
+    SVD_FUNC_DEF( gesdd )
+    SVD_FUNC(gesdd, float  , float , s)
+    SVD_FUNC(gesdd, double , double, d)
+    SVD_FUNC(gesdd, cfloat , float , c)
+    SVD_FUNC(gesdd, cdouble, double, z)
+
+#else   // Atlas causes memory freeing issues with using gesdd
+
+    template<typename T, typename Tr>
+    using svd_func_def = int (*)(ORDER_TYPE,
+                                 char jobu, char jobvt,
+                                 int m, int n,
+                                 T* in, int ldin,
+                                 Tr* s,
+                                 T* u, int ldu,
+                                 T* vt, int ldvt,
+                                 Tr *superb);
+
+    SVD_FUNC_DEF( gesvd )
+    SVD_FUNC(gesvd, float  , float , s)
+    SVD_FUNC(gesvd, double , double, d)
+    SVD_FUNC(gesvd, cfloat , float , c)
+    SVD_FUNC(gesvd, cdouble, double, z)
+
+#endif
+
+    template <typename T, typename Tr>
+    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+    {
+        dim4 iDims = in.dims();
+        int M = iDims[0];
+        int N = iDims[1];
+
+        // S, U, Vt are empty. Simply write to them
+        Tr *sPtr = pinnedAlloc<Tr>(s.elements());
+        T  *uPtr = pinnedAlloc<T >(u.elements());
+        T  *vPtr = pinnedAlloc<T >(vt.elements());
+        T  *iPtr = pinnedAlloc<T >(in.elements());
+
+        copyData(sPtr, s);
+        copyData(uPtr, u);
+        copyData(vPtr, vt);
+        copyData(iPtr, in);
+
+#if defined(USE_MKL) || defined(__APPLE__)
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, iPtr, in.strides()[1],
+                          sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1]);
+#else
+        std::vector<Tr> superb(std::min(M, N));
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, iPtr, in.strides()[1],
+                          sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1], &superb[0]);
+#endif
+        writeHostDataArray(s , sPtr, s.elements()  * sizeof(Tr));
+        writeHostDataArray(u , uPtr, u.elements()  * sizeof(T ));
+        writeHostDataArray(vt, vPtr, vt.elements() * sizeof(T ));
+        writeHostDataArray(in, iPtr, in.elements() * sizeof(T ));
+
+        pinnedFree(sPtr);
+        pinnedFree(uPtr);
+        pinnedFree(vPtr);
+        pinnedFree(iPtr);
+    }
+
+    template <typename T, typename Tr>
+    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+    {
+        dim4 iDims = in.dims();
+        int M = iDims[0];
+        int N = iDims[1];
+
+        // S, U, Vt are empty. Simply write to them
+        Tr *sPtr = pinnedAlloc<Tr>(s.elements());
+        T  *uPtr = pinnedAlloc<T >(u.elements());
+        T  *vPtr = pinnedAlloc<T >(vt.elements());
+        T  *iPtr = pinnedAlloc<T >(in.elements());
+
+        copyData(sPtr, s);
+        copyData(uPtr, u);
+        copyData(vPtr, vt);
+        copyData(iPtr, in);
+
+#if defined(USE_MKL) || defined(__APPLE__)
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, iPtr, in.strides()[1],
+                          sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1]);
+#else
+        std::vector<Tr> superb(std::min(M, N));
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, iPtr, in.strides()[1],
+                          sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1], &superb[0]);
+#endif
+        writeHostDataArray(s , sPtr, s.elements()  * sizeof(Tr));
+        writeHostDataArray(u , uPtr, u.elements()  * sizeof(T ));
+        writeHostDataArray(vt, vPtr, vt.elements() * sizeof(T ));
+
+        pinnedFree(sPtr);
+        pinnedFree(uPtr);
+        pinnedFree(vPtr);
+        pinnedFree(iPtr);
+    }
+
+#define INSTANTIATE_SVD(T, Tr)                                          \
+    template void svd<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, const Array<T> &in); \
+    template void svdInPlace<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, Array<T> &in);
+
+    INSTANTIATE_SVD(float  , float )
+    INSTANTIATE_SVD(double , double)
+    INSTANTIATE_SVD(cfloat , float )
+    INSTANTIATE_SVD(cdouble, double)
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_svd.hpp b/src/backend/cuda/cpu_lapack/cpu_svd.hpp
new file mode 100644
index 0000000000..f5fc1a8e9c
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_svd.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T, typename Tr>
+    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in);
+
+    template<typename T, typename Tr>
+    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/cpu_triangle.hpp b/src/backend/cuda/cpu_lapack/cpu_triangle.hpp
new file mode 100644
index 0000000000..fb8fea1fae
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_triangle.hpp
@@ -0,0 +1,52 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef CPU_LAPACK_TRIANGLE
+#define CPU_LAPACK_TRIANGLE
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T, bool is_upper, bool is_unit_diag>
+void triangle(T *o, const T *i, const dim4 odm, const dim4 ost, const dim4 ist)
+{
+    for(dim_t ow = 0; ow < odm[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < odm[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < odm[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+
+                for(dim_t ox = 0; ox < odm[0]; ox++) {
+                    const dim_t oMem = oYZW + ox;
+                    const dim_t iMem = iYZW + ox;
+
+                    bool cond = is_upper ? (oy >= ox) : (oy <= ox);
+                    bool do_unit_diag = (is_unit_diag && ox == oy);
+                    if(cond) {
+                        o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
+                    } else {
+                        o[oMem] = scalar<T>(0);
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
new file mode 100644
index 0000000000..58265871c2
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
@@ -0,0 +1,35 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef AFCPU_LAPACK
+#define AFCPU_LAPACK
+
+#include <types.hpp>
+
+#define lapack_complex_float cuda::cfloat
+#define lapack_complex_double cuda::cdouble
+#define LAPACK_PREFIX LAPACKE_
+#define ORDER_TYPE int
+#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
+#define LAPACK_NAME(fn) LAPACKE_##fn
+
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#include <lapacke.hpp>
+#undef AF_LAPACK_COL_MAJOR
+#define AF_LAPACK_COL_MAJOR 0
+#else
+#ifdef USE_MKL
+#include<mkl_lapacke.h>
+#else // NETLIB LAPACKE
+#include<lapacke.h>
+#endif
+#endif
+
+#endif
diff --git a/src/backend/cuda/diagonal.cu b/src/backend/cuda/diagonal.cu
index 05b8025de5..fd023c9f16 100644
--- a/src/backend/cuda/diagonal.cu
+++ b/src/backend/cuda/diagonal.cu
@@ -56,5 +56,7 @@ namespace cuda
     INSTANTIATE_DIAGONAL(uintl)
     INSTANTIATE_DIAGONAL(char)
     INSTANTIATE_DIAGONAL(uchar)
+    INSTANTIATE_DIAGONAL(short)
+    INSTANTIATE_DIAGONAL(ushort)
 
 }
diff --git a/src/backend/cuda/diff.cu b/src/backend/cuda/diff.cu
index a50ba2652c..96135f93f8 100644
--- a/src/backend/cuda/diff.cu
+++ b/src/backend/cuda/diff.cu
@@ -70,5 +70,7 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/dilate.cu b/src/backend/cuda/dilate.cu
index 0da33f2969..9115ba8f63 100644
--- a/src/backend/cuda/dilate.cu
+++ b/src/backend/cuda/dilate.cu
@@ -18,5 +18,7 @@ INSTANTIATE(char  , true)
 INSTANTIATE(int   , true)
 INSTANTIATE(uint  , true)
 INSTANTIATE(uchar , true)
+INSTANTIATE(short , true)
+INSTANTIATE(ushort, true)
 
 }
diff --git a/src/backend/cuda/dilate3d.cu b/src/backend/cuda/dilate3d.cu
index 32b0babc9d..4846e40ad9 100644
--- a/src/backend/cuda/dilate3d.cu
+++ b/src/backend/cuda/dilate3d.cu
@@ -18,5 +18,7 @@ INSTANTIATE(char  , true)
 INSTANTIATE(int   , true)
 INSTANTIATE(uint  , true)
 INSTANTIATE(uchar , true)
+INSTANTIATE(short , true)
+INSTANTIATE(ushort, true)
 
 }
diff --git a/src/backend/cuda/erode.cu b/src/backend/cuda/erode.cu
index dbb2c8ece9..25ca46c129 100644
--- a/src/backend/cuda/erode.cu
+++ b/src/backend/cuda/erode.cu
@@ -18,5 +18,7 @@ INSTANTIATE(char  , false)
 INSTANTIATE(int   , false)
 INSTANTIATE(uint  , false)
 INSTANTIATE(uchar , false)
+INSTANTIATE(short , false)
+INSTANTIATE(ushort, false)
 
 }
diff --git a/src/backend/cuda/erode3d.cu b/src/backend/cuda/erode3d.cu
index 808198a455..c54b301ba5 100644
--- a/src/backend/cuda/erode3d.cu
+++ b/src/backend/cuda/erode3d.cu
@@ -18,5 +18,7 @@ INSTANTIATE(char  , false)
 INSTANTIATE(int   , false)
 INSTANTIATE(uint  , false)
 INSTANTIATE(uchar , false)
+INSTANTIATE(short , false)
+INSTANTIATE(ushort, false)
 
 }
diff --git a/src/backend/cuda/fast.cu b/src/backend/cuda/fast.cu
index 7bd6f4772a..53741e3bf5 100644
--- a/src/backend/cuda/fast.cu
+++ b/src/backend/cuda/fast.cu
@@ -59,5 +59,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/fast_pyramid.cu b/src/backend/cuda/fast_pyramid.cu
index 3c2223683f..1e1b047d2d 100644
--- a/src/backend/cuda/fast_pyramid.cu
+++ b/src/backend/cuda/fast_pyramid.cu
@@ -50,5 +50,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/fftconvolve.cu b/src/backend/cuda/fftconvolve.cu
index 97edeec2ff..57fcb1071d 100644
--- a/src/backend/cuda/fftconvolve.cu
+++ b/src/backend/cuda/fftconvolve.cu
@@ -119,5 +119,9 @@ INSTANTIATE(uint  , float,  cfloat,  false, true)
 INSTANTIATE(int   , float,  cfloat,  false, true)
 INSTANTIATE(uchar , float,  cfloat,  false, true)
 INSTANTIATE(char  , float,  cfloat,  false, true)
+INSTANTIATE(ushort, float,  cfloat,  false, true)
+INSTANTIATE(short , float,  cfloat,  false, true)
+INSTANTIATE(uintl , float,  cfloat,  false, true)
+INSTANTIATE(intl  , float,  cfloat,  false, true)
 
 }
diff --git a/src/backend/cuda/hist_graphics.cu b/src/backend/cuda/hist_graphics.cu
index 69cb22c540..2ce0c199de 100644
--- a/src/backend/cuda/hist_graphics.cu
+++ b/src/backend/cuda/hist_graphics.cu
@@ -46,6 +46,8 @@ void copy_histogram(const Array<T> &data, const fg::Histogram* hist)
 INSTANTIATE(float)
 INSTANTIATE(int)
 INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 INSTANTIATE(uchar)
 
 }
diff --git a/src/backend/cuda/histogram.cu b/src/backend/cuda/histogram.cu
index 0f99982ef2..d17d390cdf 100644
--- a/src/backend/cuda/histogram.cu
+++ b/src/backend/cuda/histogram.cu
@@ -47,5 +47,9 @@ INSTANTIATE(char  , uint)
 INSTANTIATE(int   , uint)
 INSTANTIATE(uint  , uint)
 INSTANTIATE(uchar , uint)
+INSTANTIATE(short , uint)
+INSTANTIATE(ushort, uint)
+INSTANTIATE(intl  , uint)
+INSTANTIATE(uintl , uint)
 
 }
diff --git a/src/backend/cuda/homography.cu b/src/backend/cuda/homography.cu
new file mode 100644
index 0000000000..a7a993aa4f
--- /dev/null
+++ b/src/backend/cuda/homography.cu
@@ -0,0 +1,79 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cuda.hpp>
+#include <handle.hpp>
+#include <arith.hpp>
+#include <random.hpp>
+#include <kernel/homography.hpp>
+#include <algorithm>
+
+#include <iostream>
+#include <cfloat>
+
+using af::dim4;
+
+namespace cuda
+{
+
+#define RANSACConfidence 0.99f
+#define LMEDSConfidence 0.99f
+#define LMEDSOutlierRatio 0.4f
+
+template<typename T>
+int homography(Array<T> &bestH,
+               const Array<float> &x_src,
+               const Array<float> &y_src,
+               const Array<float> &x_dst,
+               const Array<float> &y_dst,
+               const af_homography_type htype,
+               const float inlier_thr,
+               const unsigned iterations)
+{
+    const af::dim4 idims = x_src.dims();
+    const unsigned nsamples = idims[0];
+
+    unsigned iter = iterations;
+    Array<float> err = createEmptyArray<float>(af::dim4());
+    if (htype == AF_HOMOGRAPHY_LMEDS) {
+        iter = ::std::min(iter, (unsigned)(log(1.f - LMEDSConfidence) / log(1.f - pow(1.f - LMEDSOutlierRatio, 4.f))));
+        err = createValueArray<float>(af::dim4(nsamples, iter), FLT_MAX);
+    }
+
+    af::dim4 rdims(4, iter);
+    Array<float> frnd = randu<float>(rdims);
+    Array<float> fctr = createValueArray<float>(rdims, (float)nsamples);
+    Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
+
+    Array<T> tmpH = createValueArray<T>(af::dim4(9, iter), (T)0);
+    Array<T> tmpA = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
+    Array<T> tmpV = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
+
+    bestH = createValueArray<T>(af::dim4(3, 3), (T)0);
+
+    return kernel::computeH<T>(bestH, tmpH, tmpA, tmpV, err,
+                               x_src, y_src, x_dst, y_dst,
+                               rnd, iter, nsamples, inlier_thr, htype);
+}
+
+#define INSTANTIATE(T)                                                                  \
+    template int homography<T>(Array<T> &H,                                             \
+                               const Array<float> &x_src, const Array<float> &y_src,    \
+                               const Array<float> &x_dst, const Array<float> &y_dst,    \
+                               const af_homography_type htype, const float inlier_thr,  \
+                               const unsigned iterations);
+
+INSTANTIATE(float )
+INSTANTIATE(double)
+
+}
diff --git a/src/backend/cuda/homography.hpp b/src/backend/cuda/homography.hpp
new file mode 100644
index 0000000000..514040e296
--- /dev/null
+++ b/src/backend/cuda/homography.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+
+template<typename T>
+int homography(Array<T> &H,
+               const Array<float> &x_src, const Array<float> &y_src,
+               const Array<float> &x_dst, const Array<float> &y_dst,
+               const af_homography_type htype, const float inlier_thr,
+               const unsigned iterations);
+
+}
diff --git a/src/backend/cuda/identity.cu b/src/backend/cuda/identity.cu
index 264d5b8a5e..6765766237 100644
--- a/src/backend/cuda/identity.cu
+++ b/src/backend/cuda/identity.cu
@@ -38,5 +38,7 @@ namespace cuda
     INSTANTIATE_IDENTITY(uintl)
     INSTANTIATE_IDENTITY(char)
     INSTANTIATE_IDENTITY(uchar)
+    INSTANTIATE_IDENTITY(short)
+    INSTANTIATE_IDENTITY(ushort)
 
 }
diff --git a/src/backend/cuda/image.cu b/src/backend/cuda/image.cu
index 7370fb2702..a99c79207d 100644
--- a/src/backend/cuda/image.cu
+++ b/src/backend/cuda/image.cu
@@ -53,6 +53,8 @@ INSTANTIATE(int)
 INSTANTIATE(uint)
 INSTANTIATE(uchar)
 INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
 
 }
 
diff --git a/src/backend/cuda/index.cu b/src/backend/cuda/index.cu
index 988f589ddb..b1d528c4da 100644
--- a/src/backend/cuda/index.cu
+++ b/src/backend/cuda/index.cu
@@ -75,11 +75,13 @@ INSTANTIATE(cdouble)
 INSTANTIATE(double )
 INSTANTIATE(cfloat )
 INSTANTIATE(float  )
-INSTANTIATE(uintl  )
 INSTANTIATE(uint   )
-INSTANTIATE(intl   )
 INSTANTIATE(int    )
+INSTANTIATE(uintl  )
+INSTANTIATE(intl   )
 INSTANTIATE(uchar  )
 INSTANTIATE(char   )
+INSTANTIATE(ushort )
+INSTANTIATE(short  )
 
 }
diff --git a/src/backend/cuda/interopManager.cu b/src/backend/cuda/interopManager.cu
index fe13b46e8b..b492a5ee1d 100644
--- a/src/backend/cuda/interopManager.cu
+++ b/src/backend/cuda/interopManager.cu
@@ -82,6 +82,23 @@ cudaGraphicsResource* InteropManager::getBufferResource(const fg::Plot* key)
     return interop_maps[device][key_value];
 }
 
+cudaGraphicsResource* InteropManager::getBufferResource(const fg::Plot3* key)
+{
+    int device = getActiveDeviceId();
+    void* key_value = (void*)key;
+
+    iter_t iter = interop_maps[device].find(key_value);
+
+    if(interop_maps[device].find(key_value) == interop_maps[device].end()) {
+        cudaGraphicsResource *cudaVBOResource;
+        // Register VBO with CUDA
+        CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&cudaVBOResource, key->vbo(), cudaGraphicsMapFlagsWriteDiscard));
+        interop_maps[device][key_value] = cudaVBOResource;
+    }
+
+    return interop_maps[device][key_value];
+}
+
 cudaGraphicsResource* InteropManager::getBufferResource(const fg::Histogram* key)
 {
     int device = getActiveDeviceId();
@@ -99,6 +116,23 @@ cudaGraphicsResource* InteropManager::getBufferResource(const fg::Histogram* key
     return interop_maps[device][key_value];
 }
 
+cudaGraphicsResource* InteropManager::getBufferResource(const fg::Surface* key)
+{
+    int device = getActiveDeviceId();
+    void* key_value = (void*)key;
+
+    iter_t iter = interop_maps[device].find(key_value);
+
+    if(interop_maps[device].find(key_value) == interop_maps[device].end()) {
+        cudaGraphicsResource *cudaVBOResource;
+        // Register VBO with CUDA
+        CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&cudaVBOResource, key->vbo(), cudaGraphicsMapFlagsWriteDiscard));
+        interop_maps[device][key_value] = cudaVBOResource;
+    }
+
+    return interop_maps[device][key_value];
+}
+
 }
 
 #endif
diff --git a/src/backend/cuda/interopManager.hpp b/src/backend/cuda/interopManager.hpp
index f6d3904eb5..e586d384a1 100644
--- a/src/backend/cuda/interopManager.hpp
+++ b/src/backend/cuda/interopManager.hpp
@@ -40,7 +40,9 @@ class InteropManager
         ~InteropManager();
         cudaGraphicsResource* getBufferResource(const fg::Image* handle);
         cudaGraphicsResource* getBufferResource(const fg::Plot* handle);
+        cudaGraphicsResource* getBufferResource(const fg::Plot3* handle);
         cudaGraphicsResource* getBufferResource(const fg::Histogram* handle);
+        cudaGraphicsResource* getBufferResource(const fg::Surface* handle);
 
     protected:
         InteropManager() {}
diff --git a/src/backend/cuda/inverse.cu b/src/backend/cuda/inverse.cu
index 96295f39ac..7b2ae3b17d 100644
--- a/src/backend/cuda/inverse.cu
+++ b/src/backend/cuda/inverse.cu
@@ -36,6 +36,28 @@ INSTANTIATE(cdouble)
 
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+#include <cpu_lapack/cpu_inverse.hpp>
+
+namespace cuda
+{
+
+template<typename T>
+Array<T> inverse(const Array<T> &in)
+{
+    return cpu::inverse(in);
+}
+
+#define INSTANTIATE(T)                                                                   \
+    template Array<T> inverse<T> (const Array<T> &in);
+
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+
+}
+
 #else
 namespace cuda
 {
diff --git a/src/backend/cuda/iota.cu b/src/backend/cuda/iota.cu
index ee9bcdccd4..eee4344d4d 100644
--- a/src/backend/cuda/iota.cu
+++ b/src/backend/cuda/iota.cu
@@ -37,5 +37,7 @@ namespace cuda
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
 
diff --git a/src/backend/cuda/ireduce.cu b/src/backend/cuda/ireduce.cu
index 0c14a01248..dece64c8af 100644
--- a/src/backend/cuda/ireduce.cu
+++ b/src/backend/cuda/ireduce.cu
@@ -51,6 +51,8 @@ namespace cuda
     INSTANTIATE(af_min_t, uint   )
     INSTANTIATE(af_min_t, intl   )
     INSTANTIATE(af_min_t, uintl  )
+    INSTANTIATE(af_min_t, short  )
+    INSTANTIATE(af_min_t, ushort )
     INSTANTIATE(af_min_t, char   )
     INSTANTIATE(af_min_t, uchar  )
 
@@ -63,6 +65,8 @@ namespace cuda
     INSTANTIATE(af_max_t, uint   )
     INSTANTIATE(af_max_t, intl   )
     INSTANTIATE(af_max_t, uintl  )
+    INSTANTIATE(af_max_t, short  )
+    INSTANTIATE(af_max_t, ushort )
     INSTANTIATE(af_max_t, char   )
     INSTANTIATE(af_max_t, uchar  )
 }
diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp
index b001fef60d..af5f2d6b68 100644
--- a/src/backend/cuda/jit.cpp
+++ b/src/backend/cuda/jit.cpp
@@ -500,6 +500,8 @@ template void evalNodes<char   >(Param<char   > &out, Node *node);
 template void evalNodes<uchar  >(Param<uchar  > &out, Node *node);
 template void evalNodes<intl   >(Param<intl   > &out, Node *node);
 template void evalNodes<uintl  >(Param<uintl  > &out, Node *node);
+template void evalNodes<short  >(Param<short  > &out, Node *node);
+template void evalNodes<ushort >(Param<ushort > &out, Node *node);
 
 
 }
diff --git a/src/backend/cuda/join.cu b/src/backend/cuda/join.cu
index 074326e167..729cec4c3f 100644
--- a/src/backend/cuda/join.cu
+++ b/src/backend/cuda/join.cu
@@ -170,16 +170,18 @@ namespace cuda
 #define INSTANTIATE(Tx, Ty)                                                                             \
     template Array<Tx> join<Tx, Ty>(const int dim, const Array<Tx> &first, const Array<Ty> &second);   \
 
-    INSTANTIATE(float,   float)
-    INSTANTIATE(double,  double)
-    INSTANTIATE(cfloat,  cfloat)
+    INSTANTIATE(float  , float  )
+    INSTANTIATE(double , double )
+    INSTANTIATE(cfloat , cfloat )
     INSTANTIATE(cdouble, cdouble)
-    INSTANTIATE(int,     int)
-    INSTANTIATE(uint,    uint)
-    INSTANTIATE(intl,    intl)
-    INSTANTIATE(uintl,   uintl)
-    INSTANTIATE(uchar,   uchar)
-    INSTANTIATE(char,    char)
+    INSTANTIATE(int    , int    )
+    INSTANTIATE(uint   , uint   )
+    INSTANTIATE(intl   , intl   )
+    INSTANTIATE(uintl  , uintl  )
+    INSTANTIATE(short  , short  )
+    INSTANTIATE(ushort , ushort )
+    INSTANTIATE(uchar  , uchar  )
+    INSTANTIATE(char   , char   )
 
 #undef INSTANTIATE
 
@@ -194,6 +196,8 @@ namespace cuda
     INSTANTIATE(uint)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
 
diff --git a/src/backend/cuda/kernel/approx.hpp b/src/backend/cuda/kernel/approx.hpp
index 6c9dd7de12..b1437ba201 100644
--- a/src/backend/cuda/kernel/approx.hpp
+++ b/src/backend/cuda/kernel/approx.hpp
@@ -27,13 +27,14 @@ namespace cuda
         ///////////////////////////////////////////////////////////////////////////
         template<typename Ty, typename Tp>
         __device__ inline static
-        void core_nearest1(const int idx, const int idy, const int idz, const int idw,
+        void core_nearest1(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                            Param<Ty> out, CParam<Ty> in, CParam<Tp> pos,
-                           const float offGrid)
+                           const float offGrid, const bool pBatch)
         {
-            const int omId = idw * out.strides[3] + idz * out.strides[2]
-                                + idy * out.strides[1] + idx;
-            const int pmId = idx;
+            const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                             + idy * out.strides[1] + idx;
+            dim_t pmId = idx;
+            if(pBatch) pmId += idw * pos.strides[3] + idz * pos.strides[2] + idy * pos.strides[1];
 
             const Tp x = pos.ptr[pmId];
             if (x < 0 || in.dims[0] < x+1) {
@@ -41,8 +42,8 @@ namespace cuda
                 return;
             }
 
-            int ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1];
-            const int iMem = round(x) + ioff;
+            dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1];
+            const dim_t iMem = round(x) + ioff;
 
             Ty yt = in.ptr[iMem];
             out.ptr[omId] = yt;
@@ -50,14 +51,18 @@ namespace cuda
 
         template<typename Ty, typename Tp>
         __device__ inline static
-        void core_nearest2(const int idx, const int idy, const int idz, const int idw,
+        void core_nearest2(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                            Param<Ty> out, CParam<Ty> in,
-                           CParam<Tp> pos, CParam<Tp> qos, const float offGrid)
+                           CParam<Tp> pos, CParam<Tp> qos, const float offGrid, const bool pBatch)
         {
-            const int omId = idw * out.strides[3] + idz * out.strides[2]
-                                + idy * out.strides[1] + idx;
-            const int pmId = idy * pos.strides[1] + idx;
-            const int qmId = idy * qos.strides[1] + idx;
+            const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                             + idy * out.strides[1] + idx;
+            dim_t pmId = idy * pos.strides[1] + idx;
+            dim_t qmId = idy * qos.strides[1] + idx;
+            if(pBatch) {
+                pmId += idw * pos.strides[3] + idz * pos.strides[2];
+                qmId += idw * qos.strides[3] + idz * qos.strides[2];
+            }
 
             const Tp x = pos.ptr[pmId], y = qos.ptr[qmId];
             if (x < 0 || y < 0 || in.dims[0] < x+1 || in.dims[1] < y+1) {
@@ -65,9 +70,9 @@ namespace cuda
                 return;
             }
 
-            const int grid_x = round(x), grid_y = round(y); // nearest grid
-            const int imId = idw * in.strides[3] + idz * in.strides[2]
-                             + grid_y * in.strides[1] + grid_x;
+            const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
+            const dim_t imId = idw * in.strides[3] + idz * in.strides[2]
+                          + grid_y * in.strides[1] + grid_x;
 
             Ty val = in.ptr[imId];
             out.ptr[omId] = val;
@@ -78,13 +83,14 @@ namespace cuda
         ///////////////////////////////////////////////////////////////////////////
         template<typename Ty, typename Tp>
         __device__ inline static
-        void core_linear1(const int idx, const int idy, const int idz, const int idw,
+        void core_linear1(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                           Param<Ty> out, CParam<Ty> in, CParam<Tp> pos,
-                          const float offGrid)
+                          const float offGrid, const bool pBatch)
         {
-            const int omId = idw * out.strides[3] + idz * out.strides[2]
-                                + idy * out.strides[1] + idx;
-            const int pmId = idx;
+            const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                             + idy * out.strides[1] + idx;
+            dim_t pmId = idx;
+            if(pBatch) pmId += idw * pos.strides[3] + idz * pos.strides[2] + idy * pos.strides[1];
 
             const Tp pVal = pos.ptr[pmId];
             if (pVal < 0 || in.dims[0] < pVal+1) {
@@ -92,10 +98,10 @@ namespace cuda
                 return;
             }
 
-            const int grid_x = floor(pVal);  // nearest grid
+            const dim_t grid_x = floor(pVal);  // nearest grid
             const Tp off_x = pVal - grid_x; // fractional offset
 
-            int ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1] + grid_x;
+            dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1] + grid_x;
 
             // Check if pVal and pVal + 1 are both valid indices
             bool cond = (pVal < in.dims[0] - 1);
@@ -111,14 +117,19 @@ namespace cuda
 
         template<typename Ty, typename Tp>
         __device__ inline static
-        void core_linear2(const int idx, const int idy, const int idz, const int idw,
+        void core_linear2(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                            Param<Ty> out, CParam<Ty> in,
-                           CParam<Tp> pos, CParam<Tp> qos, const float offGrid)
+                           CParam<Tp> pos, CParam<Tp> qos, const float offGrid, const bool pBatch)
         {
-            const int omId = idw * out.strides[3] + idz * out.strides[2]
-                                + idy * out.strides[1] + idx;
-            const int pmId = idy * pos.strides[1] + idx;
-            const int qmId = idy * qos.strides[1] + idx;
+            const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                             + idy * out.strides[1] + idx;
+            dim_t pmId = idy * pos.strides[1] + idx;
+            dim_t qmId = idy * qos.strides[1] + idx;
+            if(pBatch) {
+                pmId += idw * pos.strides[3] + idz * pos.strides[2];
+                qmId += idw * qos.strides[3] + idz * qos.strides[2];
+            }
+
 
             const Tp x = pos.ptr[pmId], y = qos.ptr[qmId];
             if (x < 0 || y < 0 || in.dims[0] < x+1 || in.dims[1] < y+1) {
@@ -126,10 +137,10 @@ namespace cuda
                 return;
             }
 
-            const int grid_x = floor(x),   grid_y = floor(y);   // nearest grid
+            const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
             const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
 
-            int ioff = idw * in.strides[3] + idz * in.strides[2] + grid_y * in.strides[1] + grid_x;
+            dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + grid_y * in.strides[1] + grid_x;
 
             // Check if pVal and pVal + 1 are both valid indices
             bool condY = (y < in.dims[1] - 1);
@@ -161,14 +172,14 @@ namespace cuda
         template<typename Ty, typename Tp, af_interp_type method>
         __global__
         void approx1_kernel(Param<Ty> out, CParam<Ty> in, CParam<Tp> pos,
-                            const float offGrid, const int blocksMatX)
+                            const float offGrid, const dim_t blocksMatX, const bool pBatch)
         {
-            const int idw = blockIdx.y / out.dims[2];
-            const int idz = blockIdx.y - idw * out.dims[2];
+            const dim_t idw = blockIdx.y / out.dims[2];
+            const dim_t idz = blockIdx.y - idw * out.dims[2];
 
-            const int idy = blockIdx.x / blocksMatX;
-            const int blockIdx_x = blockIdx.x - idy * blocksMatX;
-            const int idx = blockIdx_x * blockDim.x + threadIdx.x;
+            const dim_t idy = blockIdx.x / blocksMatX;
+            const dim_t blockIdx_x = blockIdx.x - idy * blocksMatX;
+            const dim_t idx = blockIdx_x * blockDim.x + threadIdx.x;
 
             if (idx >= out.dims[0] || idy >= out.dims[1] ||
                 idz >= out.dims[2] || idw >= out.dims[3])
@@ -176,10 +187,10 @@ namespace cuda
 
             switch(method) {
                 case AF_INTERP_NEAREST:
-                    core_nearest1(idx, idy, idz, idw, out, in, pos, offGrid);
+                    core_nearest1(idx, idy, idz, idw, out, in, pos, offGrid, pBatch);
                     break;
                 case AF_INTERP_LINEAR:
-                    core_linear1(idx, idy, idz, idw, out, in, pos, offGrid);
+                    core_linear1(idx, idy, idz, idw, out, in, pos, offGrid, pBatch);
                     break;
                 default:
                     break;
@@ -190,16 +201,16 @@ namespace cuda
         __global__
         void approx2_kernel(Param<Ty> out, CParam<Ty> in,
                       CParam<Tp> pos, CParam<Tp> qos, const float offGrid,
-                      const int blocksMatX, const int blocksMatY)
+                      const dim_t blocksMatX, const dim_t blocksMatY, const bool pBatch)
         {
-            const int idz = blockIdx.x / blocksMatX;
-            const int idw = blockIdx.y / blocksMatY;
+            const dim_t idz = blockIdx.x / blocksMatX;
+            const dim_t idw = blockIdx.y / blocksMatY;
 
-            int blockIdx_x = blockIdx.x - idz * blocksMatX;
-            int blockIdx_y = blockIdx.y - idw * blocksMatY;
+            dim_t blockIdx_x = blockIdx.x - idz * blocksMatX;
+            dim_t blockIdx_y = blockIdx.y - idw * blocksMatY;
 
-            int idx = threadIdx.x + blockIdx_x * blockDim.x;
-            int idy = threadIdx.y + blockIdx_y * blockDim.y;
+            dim_t idx = threadIdx.x + blockIdx_x * blockDim.x;
+            dim_t idy = threadIdx.y + blockIdx_y * blockDim.y;
 
             if (idx >= out.dims[0] || idy >= out.dims[1] ||
                 idz >= out.dims[2] || idw >= out.dims[3])
@@ -207,10 +218,10 @@ namespace cuda
 
             switch(method) {
                 case AF_INTERP_NEAREST:
-                    core_nearest2(idx, idy, idz, idw, out, in, pos, qos, offGrid);
+                    core_nearest2(idx, idy, idz, idw, out, in, pos, qos, offGrid, pBatch);
                     break;
                 case AF_INTERP_LINEAR:
-                    core_linear2(idx, idy, idz, idw, out, in, pos, qos, offGrid);
+                    core_linear2(idx, idy, idz, idw, out, in, pos, qos, offGrid, pBatch);
                     break;
                 default:
                     break;
@@ -225,11 +236,13 @@ namespace cuda
                CParam<Tp> pos, const float offGrid)
         {
             dim3 threads(THREADS, 1, 1);
-            int blocksPerMat = divup(out.dims[0], threads.x);
+            dim_t blocksPerMat = divup(out.dims[0], threads.x);
             dim3 blocks(blocksPerMat * out.dims[1], out.dims[2] * out.dims[3]);
 
+            bool pBatch = !(pos.dims[1] == 1 && pos.dims[2] == 1 && pos.dims[3] == 1);
+
             CUDA_LAUNCH((approx1_kernel<Ty, Tp, method>), blocks, threads,
-                    out, in, pos, offGrid, blocksPerMat);
+                         out, in, pos, offGrid, blocksPerMat, pBatch);
             POST_LAUNCH_CHECK();
         }
 
@@ -238,12 +251,14 @@ namespace cuda
                     CParam<Tp> pos, CParam<Tp> qos, const float offGrid)
         {
             dim3 threads(TX, TY, 1);
-            int blocksPerMatX = divup(out.dims[0], threads.x);
-            int blocksPerMatY = divup(out.dims[1], threads.y);
+            dim_t blocksPerMatX = divup(out.dims[0], threads.x);
+            dim_t blocksPerMatY = divup(out.dims[1], threads.y);
             dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3]);
 
+            bool pBatch = !(pos.dims[2] == 1 && pos.dims[3] == 1);
+
             CUDA_LAUNCH((approx2_kernel<Ty, Tp, method>), blocks, threads,
-                    out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY);
+                         out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY, pBatch);
             POST_LAUNCH_CHECK();
         }
     }
diff --git a/src/backend/cuda/kernel/convolve.cu b/src/backend/cuda/kernel/convolve.cu
index 78790c339d..468ae2bf51 100644
--- a/src/backend/cuda/kernel/convolve.cu
+++ b/src/backend/cuda/kernel/convolve.cu
@@ -485,12 +485,12 @@ void convolve_nd(Param<T> out, CParam<T> signal, CParam<aT> filt, ConvolveBatchK
 }
 
 #define INSTANTIATE(T, aT)  \
-	template void convolve_nd<T, aT, 1, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
-	template void convolve_nd<T, aT, 1, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
-	template void convolve_nd<T, aT, 2, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
-	template void convolve_nd<T, aT, 2, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
-	template void convolve_nd<T, aT, 3, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
-	template void convolve_nd<T, aT, 3, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 1, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 1, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 2, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 2, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 3, true >(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
+    template void convolve_nd<T, aT, 3, false>(Param<T> out, CParam<T> signal, CParam<aT> filter, ConvolveBatchKind kind);\
 
 
 INSTANTIATE(cdouble, cdouble)
@@ -501,6 +501,10 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
 
diff --git a/src/backend/cuda/kernel/convolve_separable.cu b/src/backend/cuda/kernel/convolve_separable.cu
index e2caec7e08..654ec09fbc 100644
--- a/src/backend/cuda/kernel/convolve_separable.cu
+++ b/src/backend/cuda/kernel/convolve_separable.cu
@@ -174,10 +174,10 @@ void convolve2(Param<T> out, CParam<T> signal, CParam<accType> filter)
 }
 
 #define INSTANTIATE(T, accType)                                         \
-	template void convolve2<T, accType, 0, true >(Param<T> out, CParam<T> signal, CParam<accType> filter); \
-	template void convolve2<T, accType, 0, false>(Param<T> out, CParam<T> signal, CParam<accType> filter); \
-	template void convolve2<T, accType, 1, true >(Param<T> out, CParam<T> signal, CParam<accType> filter); \
-	template void convolve2<T, accType, 1, false>(Param<T> out, CParam<T> signal, CParam<accType> filter); \
+    template void convolve2<T, accType, 0, true >(Param<T> out, CParam<T> signal, CParam<accType> filter); \
+    template void convolve2<T, accType, 0, false>(Param<T> out, CParam<T> signal, CParam<accType> filter); \
+    template void convolve2<T, accType, 1, true >(Param<T> out, CParam<T> signal, CParam<accType> filter); \
+    template void convolve2<T, accType, 1, false>(Param<T> out, CParam<T> signal, CParam<accType> filter); \
 
 
 INSTANTIATE(cdouble, cdouble)
@@ -188,6 +188,10 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
 
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index df4f406cd2..6d6b0e0992 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -87,6 +87,16 @@ unsigned max_val(const unsigned x, const unsigned y)
     return max(x, y);
 }
 inline __device__
+short max_val(const short x, const short y)
+{
+    return max(x, y);
+}
+inline __device__
+ushort max_val(const ushort x, const ushort y)
+{
+    return max(x, y);
+}
+inline __device__
 float max_val(const float x, const float y)
 {
     return fmax(x, y);
@@ -109,6 +119,16 @@ inline __device__ unsigned abs_diff(const unsigned x, const unsigned y)
     int i = (int)x - (int)y;
     return max(-i, i);
 }
+inline __device__ short abs_diff(const short x, const short y)
+{
+    short i = x - y;
+    return max(-i, i);
+}
+inline __device__ ushort abs_diff(const ushort x, const ushort y)
+{
+    int i = (int)x - (int)y;
+    return (ushort)max(-i, i);
+}
 inline __device__ float abs_diff(const float x, const float y)
 {
     return fabs(x - y);
@@ -445,7 +465,9 @@ void fast(unsigned* out_feat,
 
     // Dimensions of output array
     unsigned total;
-    CUDA_CHECK(cudaMemcpy(&total, d_total, sizeof(unsigned), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&total, d_total, sizeof(unsigned), cudaMemcpyDeviceToHost,
+                cuda::getStream(cuda::getActiveDeviceId())));
+    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
     total = total < max_feat ? total : max_feat;
 
     if (total > 0) {
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index d6228de382..44f98d92c1 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -216,7 +216,8 @@ void harris(unsigned* corners_out,
 
     int filter_elem = filter.strides[3] * filter.dims[3];
     filter.ptr = memAlloc<convAccT>(filter_elem);
-    CUDA_CHECK(cudaMemcpy(filter.ptr, h_filter, filter_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(filter.ptr, h_filter, filter_elem * sizeof(convAccT),
+                cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
     delete[] h_filter;
 
@@ -305,7 +306,9 @@ void harris(unsigned* corners_out,
             in.dims[0], in.dims[1], d_responses, min_r, border_len, corner_lim);
 
     unsigned corners_found = 0;
-    CUDA_CHECK(cudaMemcpy(&corners_found, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&corners_found, d_corners_found, sizeof(unsigned),
+                cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
     memFree(d_responses);
     memFree(d_corners_found);
diff --git a/src/backend/cuda/kernel/homography.hpp b/src/backend/cuda/kernel/homography.hpp
new file mode 100644
index 0000000000..8dd179492c
--- /dev/null
+++ b/src/backend/cuda/kernel/homography.hpp
@@ -0,0 +1,699 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <dispatch.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <memory.hpp>
+#include "ireduce.hpp"
+#include "reduce.hpp"
+#include "sort.hpp"
+
+#include <cfloat>
+
+#include <iostream>
+
+namespace cuda
+{
+
+namespace kernel
+{
+
+template<typename T>
+__device__ T sq(T a)
+{
+    return a * a;
+}
+
+template<typename T>
+struct EPS
+{
+    __device__ T eps() { return FLT_EPSILON; }
+};
+
+template<>
+struct EPS<float>
+{
+    __device__ static float eps() { return FLT_EPSILON; }
+};
+
+template<>
+struct EPS<double>
+{
+    __device__ static double eps() { return DBL_EPSILON; }
+};
+
+#define RANSACConfidence 0.99f
+#define LMEDSConfidence 0.99f
+#define LMEDSOutlierRatio 0.4f
+
+
+template<typename T>
+__device__ void JacobiSVD(T* S, T* V, int m, int n)
+{
+    const int iterations = 30;
+
+    int tid_x = threadIdx.x;
+    int bsz_x = blockDim.x;
+    int tid_y = threadIdx.y;
+    int gid_y = blockIdx.y * blockDim.y + tid_y;
+
+    __shared__ T acc[512];
+    T* acc1 = acc;
+    T* acc2 = acc + 256;
+
+    __shared__ T s_S[16*81];
+    __shared__ T s_V[16*81];
+    __shared__ T d[16*9];
+
+    for (int i = 0; i <= 4; i++)
+        s_S[tid_y * 81 + i*bsz_x + tid_x] = S[gid_y * 81 + i*bsz_x + tid_x];
+    if (tid_x == 0)
+        s_S[tid_y * 81 + 80] = S[gid_y * 81 + 80];
+    __syncthreads();
+
+    // Copy first 80 elements
+    for (int i = 0; i <= 4; i++) {
+        T t = s_S[tid_y*81 + tid_x+i*bsz_x];
+        acc1[tid_y*bsz_x + tid_x] += t*t;
+    }
+    if (tid_x < 8)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+8];
+    __syncthreads();
+    if (tid_x < 4)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+4];
+    __syncthreads();
+    if (tid_x < 2)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+2];
+    __syncthreads();
+    if (tid_x < 1) {
+        // Copy last element
+        T t = s_S[tid_y*bsz_x + tid_x+80];
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+1] + t*t;
+    }
+    __syncthreads();
+
+    if (tid_x < n)
+        d[tid_y*9 + tid_x] = acc1[tid_y*bsz_x + tid_x];
+
+    // V is initialized as an identity matrix
+    for (int i = 0; i <= 4; i++) {
+        s_V[tid_y*81 + i*bsz_x + tid_x] = 0;
+    }
+    __syncthreads();
+    if (tid_x < m)
+        s_V[tid_y*81 + tid_x*m + tid_x] = 1;
+    __syncthreads();
+
+    for (int it = 0; it < iterations; it++) {
+        bool converged = false;
+
+        for (int i = 0; i < n-1; i++) {
+            for (int j = i+1; j < n; j++) {
+                T* Si = s_S + tid_y*81 + i*m;
+                T* Sj = s_S + tid_y*81 + j*m;
+
+                T p = (T)0;
+                for (int k = 0; k < m; k++)
+                    p += Si[k]*Sj[k];
+
+                T c = 0, s = 0;
+
+                bool cond = (abs(p) > EPS<T>::eps()*sqrt(d[tid_y*9 + i]*d[tid_y*9 + j]));
+                if (cond) {
+                    T y = d[tid_y*9 + i] - d[tid_y*9 + j];
+                    T r = hypot(p*2, y);
+                    T r2 = r*2;
+                    if (y >= 0) {
+                        c = sqrt((r + y) / r2);
+                        s = p / (r2*c);
+                    }
+                    else {
+                        s = sqrt((r - y) / r2);
+                        c = p / (r2*s);
+                    }
+                }
+                __syncthreads();
+
+                if (cond && tid_x < m) {
+                    T t0 = c*Si[tid_x] + s*Sj[tid_x];
+                    T t1 = c*Sj[tid_x] - s*Si[tid_x];
+                    Si[tid_x] = t0;
+                    Sj[tid_x] = t1;
+
+                    acc1[tid_y*16 + tid_x] = t0*t0;
+                    acc2[tid_y*16 + tid_x] = t1*t1;
+                }
+                __syncthreads();
+
+                if (cond && tid_x < 4) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+4];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+4];
+                }
+                __syncthreads();
+                if (cond && tid_x < 2) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+2];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+2];
+                }
+                __syncthreads();
+                if (cond && tid_x < 1) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+1] + acc1[tid_y*16 + tid_x+8];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+1] + acc2[tid_y*16 + tid_x+8];
+                }
+                __syncthreads();
+
+                if (cond && tid_x == 0) {
+                    d[tid_y*9 + i] = acc1[tid_y*16];
+                    d[tid_y*9 + j] = acc2[tid_y*16];
+                }
+                __syncthreads();
+
+                T* Vi = s_V + tid_y*81 + i*n;
+                T* Vj = s_V + tid_y*81 + j*n;
+
+                if (cond && tid_x < n) {
+                    T t0 = Vi[tid_x] * c + Vj[tid_x] * s;
+                    T t1 = Vj[tid_x] * c - Vi[tid_x] * s;
+
+                    Vi[tid_x] = t0;
+                    Vj[tid_x] = t1;
+                }
+                __syncthreads();
+
+                converged = true;
+            }
+            if (!converged)
+                break;
+        }
+    }
+    __syncthreads();
+
+    for (int i = 0; i <= 4; i++)
+        V[gid_y * 81 + tid_x+i*bsz_x] = s_V[tid_y * 81 + tid_x+i*bsz_x];
+    if (tid_x == 0)
+        V[gid_y * 81 + 80] = s_V[tid_y * 81 + 80];
+    __syncthreads();
+}
+
+__device__ bool computeMeanScale(
+    float* x_src_mean,
+    float* y_src_mean,
+    float* x_dst_mean,
+    float* y_dst_mean,
+    float* src_scale,
+    float* dst_scale,
+    float* src_pt_x,
+    float* src_pt_y,
+    float* dst_pt_x,
+    float* dst_pt_y,
+    CParam<float> x_src,
+    CParam<float> y_src,
+    CParam<float> x_dst,
+    CParam<float> y_dst,
+    CParam<float> rnd,
+    int i)
+{
+    const unsigned ridx = rnd.dims[0] * i;
+    unsigned r[4] = { (unsigned)rnd.ptr[ridx],
+                      (unsigned)rnd.ptr[ridx+1],
+                      (unsigned)rnd.ptr[ridx+2],
+                      (unsigned)rnd.ptr[ridx+3] };
+
+    // If one of the points is repeated, it's a bad samples, will still
+    // compute homography to ensure all threads pass __syncthreads()
+    bool bad = (r[0] == r[1] || r[0] == r[2] || r[0] == r[3] ||
+                r[1] == r[2] || r[1] == r[3] || r[2] == r[3]);
+
+    for (unsigned j = 0; j < 4; j++) {
+        src_pt_x[j] = x_src.ptr[r[j]];
+        src_pt_y[j] = y_src.ptr[r[j]];
+        dst_pt_x[j] = x_dst.ptr[r[j]];
+        dst_pt_y[j] = y_dst.ptr[r[j]];
+    }
+
+    *x_src_mean = (src_pt_x[0] + src_pt_x[1] + src_pt_x[2] + src_pt_x[3]) / 4.f;
+    *y_src_mean = (src_pt_y[0] + src_pt_y[1] + src_pt_y[2] + src_pt_y[3]) / 4.f;
+    *x_dst_mean = (dst_pt_x[0] + dst_pt_x[1] + dst_pt_x[2] + dst_pt_x[3]) / 4.f;
+    *y_dst_mean = (dst_pt_y[0] + dst_pt_y[1] + dst_pt_y[2] + dst_pt_y[3]) / 4.f;
+
+    float src_var = 0.0f, dst_var = 0.0f;
+    for (unsigned j = 0; j < 4; j++) {
+        src_var += sq(src_pt_x[j] - *x_src_mean) + sq(src_pt_y[j] - *y_src_mean);
+        dst_var += sq(dst_pt_x[j] - *x_dst_mean) + sq(dst_pt_y[j] - *y_dst_mean);
+    }
+
+    src_var /= 4.f;
+    dst_var /= 4.f;
+
+    *src_scale = sqrt(2.0f) / sqrt(src_var);
+    *dst_scale = sqrt(2.0f) / sqrt(dst_var);
+
+    return !bad;
+}
+
+#define APTR(Z, Y, X) (A.ptr[(Z) * A.dims[0] * A.dims[1] + (Y) * A.dims[0] + (X)])
+
+template<typename T>
+__global__ void buildLinearSystem(
+    Param<T> H,
+    Param<T> A,
+    Param<T> V,
+    CParam<float> x_src,
+    CParam<float> y_src,
+    CParam<float> x_dst,
+    CParam<float> y_dst,
+    CParam<float> rnd,
+    const unsigned iterations)
+{
+    unsigned i = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < iterations) {
+        float x_src_mean, y_src_mean;
+        float x_dst_mean, y_dst_mean;
+        float src_scale, dst_scale;
+        float src_pt_x[4], src_pt_y[4], dst_pt_x[4], dst_pt_y[4];
+
+        computeMeanScale(&x_src_mean, &y_src_mean,
+                         &x_dst_mean, &y_dst_mean,
+                         &src_scale, &dst_scale,
+                         src_pt_x, src_pt_y,
+                         dst_pt_x, dst_pt_y,
+                         x_src, y_src, x_dst, y_dst,
+                         rnd, i);
+
+        // Compute input matrix
+        for (unsigned j = threadIdx.x; j < 4; j+=blockDim.x) {
+            float srcx = (src_pt_x[j] - x_src_mean) * src_scale;
+            float srcy = (src_pt_y[j] - y_src_mean) * src_scale;
+            float dstx = (dst_pt_x[j] - x_dst_mean) * dst_scale;
+            float dsty = (dst_pt_y[j] - y_dst_mean) * dst_scale;
+
+            APTR(i, 3, j*2) = -srcx;
+            APTR(i, 4, j*2) = -srcy;
+            APTR(i, 5, j*2) = -1.0f;
+            APTR(i, 6, j*2) = dsty*srcx;
+            APTR(i, 7, j*2) = dsty*srcy;
+            APTR(i, 8, j*2) = dsty;
+
+            APTR(i, 0, j*2+1) = srcx;
+            APTR(i, 1, j*2+1) = srcy;
+            APTR(i, 2, j*2+1) = 1.0f;
+            APTR(i, 6, j*2+1) = -dstx*srcx;
+            APTR(i, 7, j*2+1) = -dstx*srcy;
+            APTR(i, 8, j*2+1) = -dstx;
+        }
+
+        JacobiSVD<T>(A.ptr, V.ptr, 9, 9);
+
+        T vH[9], H_tmp[9];
+        for (unsigned j = 0; j < 9; j++)
+            vH[j] = V.ptr[i * V.dims[0] * V.dims[1] + 8 * V.dims[0] + j];
+
+        H_tmp[0] = src_scale*x_dst_mean*vH[6] + src_scale*vH[0]/dst_scale;
+        H_tmp[1] = src_scale*x_dst_mean*vH[7] + src_scale*vH[1]/dst_scale;
+        H_tmp[2] = x_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                              (vH[2] - src_scale*y_src_mean*vH[1] - src_scale*x_src_mean*vH[0])/dst_scale;
+
+        H_tmp[3] = src_scale*y_dst_mean*vH[6] + src_scale*vH[3]/dst_scale;
+        H_tmp[4] = src_scale*y_dst_mean*vH[7] + src_scale*vH[4]/dst_scale;
+        H_tmp[5] = y_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                              (vH[5] - src_scale*y_src_mean*vH[4] - src_scale*x_src_mean*vH[3])/dst_scale;
+
+        H_tmp[6] = src_scale*vH[6];
+        H_tmp[7] = src_scale*vH[7];
+        H_tmp[8] = vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6];
+
+        const unsigned Hidx = H.dims[0] * i;
+        T* H_ptr = H.ptr + Hidx;
+        for (int h = 0; h < 9; h++)
+            H_ptr[h] = H_tmp[h];
+    }
+}
+
+#undef APTR
+
+// LMedS: http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
+template<typename T>
+__global__ void computeEvalHomography(
+    Param<unsigned> inliers,
+    Param<unsigned> idx,
+    Param<T> H,
+    Param<float> err,
+    CParam<float> x_src,
+    CParam<float> y_src,
+    CParam<float> x_dst,
+    CParam<float> y_dst,
+    CParam<float> rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr,
+    const af_homography_type htype)
+{
+    unsigned bid_x = blockIdx.x;
+    unsigned tid_x = threadIdx.x;
+    unsigned i = bid_x * blockDim.x + tid_x;
+
+    __shared__ unsigned s_inliers[256];
+    __shared__ unsigned s_idx[256];
+
+    s_inliers[tid_x] = 0;
+    s_idx[tid_x]     = 0;
+    __syncthreads();
+
+    if (i < iterations) {
+        const unsigned Hidx = H.dims[0] * i;
+        T* H_ptr = H.ptr + Hidx;
+        T H_tmp[9];
+        for (int h = 0; h < 9; h++)
+            H_tmp[h] = H_ptr[h];
+
+        if (htype == AF_HOMOGRAPHY_RANSAC) {
+            // Compute inliers
+            unsigned inliers_count = 0;
+            for (unsigned j = 0; j < nsamples; j++) {
+                float z =  H_tmp[6]*x_src.ptr[j] + H_tmp[7]*y_src.ptr[j] + H_tmp[8];
+                float x = (H_tmp[0]*x_src.ptr[j] + H_tmp[1]*y_src.ptr[j] + H_tmp[2]) / z;
+                float y = (H_tmp[3]*x_src.ptr[j] + H_tmp[4]*y_src.ptr[j] + H_tmp[5]) / z;
+
+                float dist = sq(x_dst.ptr[j] - x) + sq(y_dst.ptr[j] - y);
+                if (dist < inlier_thr*inlier_thr)
+                    inliers_count++;
+            }
+
+            s_inliers[tid_x] = inliers_count;
+            s_idx[tid_x]     = i;
+        }
+        else if (htype == AF_HOMOGRAPHY_LMEDS) {
+            // Compute error
+            for (unsigned j = 0; j < nsamples; j++) {
+                float z =  H_tmp[6]*x_src.ptr[j] + H_tmp[7]*y_src.ptr[j] + H_tmp[8];
+                float x = (H_tmp[0]*x_src.ptr[j] + H_tmp[1]*y_src.ptr[j] + H_tmp[2]) / z;
+                float y = (H_tmp[3]*x_src.ptr[j] + H_tmp[4]*y_src.ptr[j] + H_tmp[5]) / z;
+
+                float dist = sq(x_dst.ptr[j] - x) + sq(y_dst.ptr[j] - y);
+                err.ptr[i*err.dims[0] + j] = sqrt(dist);
+            }
+        }
+    }
+
+    if (htype == AF_HOMOGRAPHY_RANSAC) {
+        // Find sample with most inliers
+        for (unsigned tx = 128; tx > 0; tx >>= 1) {
+            if (tid_x < tx) {
+                if (s_inliers[tid_x + tx] > s_inliers[tid_x]) {
+                    s_inliers[tid_x] = s_inliers[tid_x + tx];
+                    s_idx[tid_x]     = s_idx[tid_x + tx];
+                }
+            }
+            __syncthreads();
+        }
+
+        inliers.ptr[bid_x] = s_inliers[0];
+        idx.ptr[bid_x]     = s_idx[0];
+    }
+}
+
+__global__ void computeMedian(
+    Param<float> median,
+    Param<unsigned> idx,
+    CParam<float> err,
+    const unsigned iterations)
+{
+    const unsigned tid = threadIdx.x;
+    const unsigned bid = blockIdx.x;
+    const unsigned i = bid * blockDim.x + threadIdx.x;
+
+    __shared__ float s_median[256];
+    __shared__ unsigned s_idx[256];
+
+    s_median[tid] = FLT_MAX;
+    s_idx[tid] = 0;
+
+    if (i < iterations) {
+        const int nsamples = err.dims[0];
+        float m = err.ptr[i*nsamples + nsamples / 2];
+        if (nsamples % 2 == 0)
+            m = (m + err.ptr[i*nsamples + nsamples / 2 - 1]) * 0.5f;
+
+        s_idx[tid] = i;
+        s_median[tid] = m;
+    }
+    __syncthreads();
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (s_median[tid + t] < s_median[tid]) {
+                s_median[tid] = s_median[tid + t];
+                s_idx[tid]    = s_idx[tid + t];
+            }
+        }
+        __syncthreads();
+    }
+
+    median.ptr[bid] = s_median[0];
+    idx.ptr[bid] = s_idx[0];
+}
+
+#define DIVUP(A, B) (((A) + (B) - 1) / (B))
+
+__global__ void findMinMedian(
+    float* minMedian,
+    unsigned* minIdx,
+    CParam<float> median,
+    CParam<unsigned> idx)
+{
+    const int tid = threadIdx.x;
+
+    __shared__ float s_minMedian[256];
+    __shared__ unsigned s_minIdx[256];
+
+    s_minMedian[tid] = FLT_MAX;
+    s_minIdx[tid] = 0;
+    __syncthreads();
+
+    const int loop = DIVUP(median.dims[0], blockDim.x);
+
+    for (int i = 0; i < loop; i++) {
+        int j = i * blockDim.x + tid;
+        if (j < median.dims[0] && median.ptr[j] < s_minMedian[tid]) {
+            s_minMedian[tid] = median.ptr[j];
+            s_minIdx[tid] = idx.ptr[j];
+        }
+        __syncthreads();
+    }
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (s_minMedian[tid + t] < s_minMedian[tid]) {
+                s_minMedian[tid] = s_minMedian[tid + t];
+                s_minIdx[tid]    = s_minIdx[tid + t];
+            }
+        }
+        __syncthreads();
+    }
+
+    *minMedian = s_minMedian[0];
+    *minIdx = s_minIdx[0];
+}
+
+#undef DIVUP
+
+template<typename T>
+__global__ void computeLMedSInliers(
+    Param<unsigned> inliers,
+    CParam<T> H,
+    CParam<float> x_src,
+    CParam<float> y_src,
+    CParam<float> x_dst,
+    CParam<float> y_dst,
+    const float minMedian,
+    const unsigned nsamples)
+{
+    unsigned tid = threadIdx.x;
+    unsigned bid = blockIdx.x;
+    unsigned i = bid * blockDim.x + tid;
+
+    __shared__ T s_H[9];
+    __shared__ unsigned s_inliers[256];
+
+    s_inliers[tid] = 0;
+    __syncthreads();
+
+    if (tid < 9)
+        s_H[tid] = H.ptr[tid];
+    __syncthreads();
+
+    float sigma = max(1.4826f * (1 + 5.f/(nsamples - 4)) * (float)sqrt(minMedian), 1e-6f);
+    float dist_thr = sq(2.5f * sigma);
+
+    if (i < nsamples) {
+        float z =  s_H[6]*x_src.ptr[i] + s_H[7]*y_src.ptr[i] + s_H[8];
+        float x = (s_H[0]*x_src.ptr[i] + s_H[1]*y_src.ptr[i] + s_H[2]) / z;
+        float y = (s_H[3]*x_src.ptr[i] + s_H[4]*y_src.ptr[i] + s_H[5]) / z;
+
+        float dist = sq(x_dst.ptr[i] - x) + sq(y_dst.ptr[i] - y);
+        if (dist <= dist_thr)
+            s_inliers[tid] = 1;
+    }
+    __syncthreads();
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t)
+            s_inliers[tid] += s_inliers[tid + t];
+        __syncthreads();
+    }
+
+    inliers.ptr[bid] = s_inliers[0];
+}
+
+template<typename T>
+int computeH(
+    Param<T> bestH,
+    Param<T> H,
+    Param<T> A,
+    Param<T> V,
+    Param<float> err,
+    CParam<float> x_src,
+    CParam<float> y_src,
+    CParam<float> x_dst,
+    CParam<float> y_dst,
+    CParam<float> rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr,
+    const af_homography_type htype)
+{
+    dim3 threads(16, 16);
+    dim3 blocks(1, divup(iterations, threads.y));
+
+    // Build linear system and solve SVD
+    CUDA_LAUNCH((buildLinearSystem<T>), blocks, threads,
+                H, A, V, x_src, y_src, x_dst, y_dst, rnd, iterations);
+    POST_LAUNCH_CHECK();
+
+    threads = dim3(256);
+    blocks = dim3(divup(iterations, threads.x));
+
+    // Allocate some temporary buffers
+    Param<unsigned> idx, inliers;
+    Param<float> median;
+    inliers.dims[0] = (htype == AF_HOMOGRAPHY_RANSAC) ? blocks.x : divup(nsamples, threads.x);
+    inliers.strides[0] = 1;
+    idx.dims[0] = median.dims[0] = blocks.x;
+    idx.strides[0] = median.strides[0] = 1;
+    for (int k = 1; k < 4; k++) {
+        inliers.dims[k] = 1;
+        inliers.strides[k] = inliers.dims[k-1] * inliers.strides[k-1];
+        idx.dims[k] = median.dims[k] = 1;
+        idx.strides[k] = median.strides[k] = idx.dims[k-1] * idx.strides[k-1];
+    }
+    idx.ptr = memAlloc<unsigned>(idx.dims[3] * idx.strides[3]);
+    inliers.ptr = memAlloc<unsigned>(inliers.dims[3] * inliers.strides[3]);
+    if (htype == AF_HOMOGRAPHY_LMEDS)
+        median.ptr = memAlloc<float>(median.dims[3] * median.strides[3]);
+
+    // Compute (and for RANSAC, evaluate) homographies
+    CUDA_LAUNCH((computeEvalHomography<T>), blocks, threads,
+                 inliers, idx, H, err, x_src, y_src, x_dst, y_dst,
+                 rnd, iterations, nsamples, inlier_thr, htype);
+    POST_LAUNCH_CHECK();
+
+    unsigned inliersH, idxH;
+    if (htype == AF_HOMOGRAPHY_LMEDS) {
+        // TODO: Improve this sorting, if the number of iterations is
+        // sufficiently large, this can be *very* slow
+        kernel::sort0<float, true>(err);
+
+        unsigned minIdx;
+        float minMedian;
+
+        // Compute median of every iteration
+        CUDA_LAUNCH((computeMedian), blocks, threads,
+                    median, idx, err, iterations);
+        POST_LAUNCH_CHECK();
+
+        // Reduce medians, only in case iterations > 256
+        if (blocks.x > 1) {
+            blocks = dim3(1);
+
+            float* finalMedian = memAlloc<float>(1);
+            unsigned* finalIdx = memAlloc<unsigned>(1);
+
+            CUDA_LAUNCH((findMinMedian), blocks, threads,
+                        finalMedian, finalIdx, median, idx);
+            POST_LAUNCH_CHECK();
+
+            CUDA_CHECK(cudaMemcpy(&minMedian, finalMedian, sizeof(float), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpy(&minIdx, finalIdx, sizeof(unsigned), cudaMemcpyDeviceToHost));
+
+            memFree(finalMedian);
+            memFree(finalIdx);
+        }
+        else {
+            CUDA_CHECK(cudaMemcpy(&minMedian, median.ptr, sizeof(float), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpy(&minIdx, idx.ptr, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        }
+
+        // Copy best homography to output
+        CUDA_CHECK(cudaMemcpy(bestH.ptr, H.ptr + minIdx * 9, 9*sizeof(T), cudaMemcpyDeviceToDevice));
+
+        blocks = dim3(divup(nsamples, threads.x));
+
+        CUDA_LAUNCH((computeLMedSInliers<T>), blocks, threads,
+                    inliers, bestH, x_src, y_src, x_dst, y_dst,
+                    minMedian, nsamples);
+        POST_LAUNCH_CHECK();
+
+        // Adds up the total number of inliers
+        Param<unsigned> totalInliers;
+        for (int k = 0; k < 4; k++)
+            totalInliers.dims[k] = totalInliers.strides[k] = 1;
+        totalInliers.ptr = memAlloc<unsigned>(1);
+
+        kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);
+
+        CUDA_CHECK(cudaMemcpy(&inliersH, totalInliers.ptr, sizeof(unsigned), cudaMemcpyDeviceToHost));
+
+        memFree(totalInliers.ptr);
+        memFree(median.ptr);
+    }
+    else if (htype == AF_HOMOGRAPHY_RANSAC) {
+        Param<unsigned> bestInliers, bestIdx;
+        for (int k = 0; k < 4; k++) {
+            bestInliers.dims[k] = bestIdx.dims[k] = 1;
+            bestInliers.strides[k] = bestIdx.strides[k] = 1;
+        }
+        bestInliers.ptr = memAlloc<unsigned>(1);
+        bestIdx.ptr = memAlloc<unsigned>(1);
+
+        kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.ptr, inliers, 0);
+
+        unsigned blockIdx;
+        CUDA_CHECK(cudaMemcpy(&blockIdx, bestIdx.ptr, sizeof(unsigned), cudaMemcpyDeviceToHost));
+
+        // Copies back index and number of inliers of best homography estimation
+        CUDA_CHECK(cudaMemcpy(&idxH, idx.ptr+blockIdx, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpy(&inliersH, bestInliers.ptr, sizeof(unsigned), cudaMemcpyDeviceToHost));
+
+        CUDA_CHECK(cudaMemcpy(bestH.ptr, H.ptr + idxH * 9, 9*sizeof(T), cudaMemcpyDeviceToDevice));
+
+        memFree(bestInliers.ptr);
+        memFree(bestIdx.ptr);
+    }
+
+    memFree(inliers.ptr);
+    memFree(idx.ptr);
+
+    return (int)inliersH;
+}
+
+} // namespace kernel
+
+} // namespace cuda
diff --git a/src/backend/cuda/kernel/ireduce.hpp b/src/backend/cuda/kernel/ireduce.hpp
index 4354f2a5fa..7aaeb248cd 100644
--- a/src/backend/cuda/kernel/ireduce.hpp
+++ b/src/backend/cuda/kernel/ireduce.hpp
@@ -492,8 +492,11 @@ namespace kernel
             T*      h_ptr_raw = h_ptr.get();
             uint*   h_lptr_raw = h_lptr.get();
 
-            CUDA_CHECK(cudaMemcpy(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(T), cudaMemcpyDeviceToHost));
-            CUDA_CHECK(cudaMemcpy(h_lptr_raw, tlptr, tmp_elements * sizeof(uint), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(T),
+                       cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaMemcpyAsync(h_lptr_raw, tlptr, tmp_elements * sizeof(uint),
+                       cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
             memFree(tmp.ptr);
             memFree(tlptr);
 
@@ -520,7 +523,9 @@ namespace kernel
 
             scoped_ptr<T> h_ptr(new T[in_elements]);
             T* h_ptr_raw = h_ptr.get();
-            CUDA_CHECK(cudaMemcpy(h_ptr_raw, in.ptr, in_elements * sizeof(T), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, in.ptr, in_elements * sizeof(T),
+                       cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
             MinMaxOp<op, T> Op(h_ptr_raw[0], 0);
             for (int i = 1; i < in_elements; i++) {
diff --git a/src/backend/cuda/kernel/memcopy.hpp b/src/backend/cuda/kernel/memcopy.hpp
index 4d5d19231f..dc437b4142 100644
--- a/src/backend/cuda/kernel/memcopy.hpp
+++ b/src/backend/cuda/kernel/memcopy.hpp
@@ -142,6 +142,8 @@ namespace kernel
     OTHER_SPECIALIZATIONS(uint  )
     OTHER_SPECIALIZATIONS(intl   )
     OTHER_SPECIALIZATIONS(uintl  )
+    OTHER_SPECIALIZATIONS(short  )
+    OTHER_SPECIALIZATIONS(ushort )
     OTHER_SPECIALIZATIONS(uchar )
     OTHER_SPECIALIZATIONS(char  )
     ////////////////////////////// END - templated help functions for copy_kernel //////////////////////////////////
diff --git a/src/backend/cuda/kernel/nearest_neighbour.hpp b/src/backend/cuda/kernel/nearest_neighbour.hpp
index 14c448f219..9b14cb5da6 100644
--- a/src/backend/cuda/kernel/nearest_neighbour.hpp
+++ b/src/backend/cuda/kernel/nearest_neighbour.hpp
@@ -68,6 +68,15 @@ struct dist_op<uintl, To, AF_SHD>
     }
 };
 
+template<typename To>
+struct dist_op<ushort, To, AF_SHD>
+{
+    __device__ To operator()(ushort v1, ushort v2)
+    {
+        return __popc(v1 ^ v2);
+    }
+};
+
 template<typename To>
 struct dist_op<uchar, To, AF_SHD>
 {
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 493540599f..89de56065d 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -330,7 +330,8 @@ void orb(unsigned* out_feat,
 
     // In future implementations, the user will be capable of passing his
     // distribution instead of using the reference one
-    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));
+    //CUDA_CHECK(cudaMemcpyToSymbolAsync(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0,
+    // cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
     vector<float*> d_score_pyr(max_levels);
     vector<float*> d_ori_pyr(max_levels);
@@ -356,7 +357,9 @@ void orb(unsigned* out_feat,
 
         int gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
         gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
-        CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss.get(), gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(gauss_filter.ptr, h_gauss.get(), gauss_elem * sizeof(convAccT),
+                    cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
     }
 
     for (int i = 0; i < (int)max_levels; i++) {
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index 89b604e3a9..118ba4e87c 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -414,7 +414,9 @@ namespace kernel
             scoped_ptr<To> h_ptr(new To[tmp_elements]);
             To* h_ptr_raw = h_ptr.get();
 
-            CUDA_CHECK(cudaMemcpy(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(To), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(To),
+                       cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
             memFree(tmp.ptr);
 
             Binary<To, op> reduce;
@@ -429,7 +431,9 @@ namespace kernel
 
             scoped_ptr<Ti> h_ptr(new Ti[in_elements]);
             Ti* h_ptr_raw = h_ptr.get();
-            CUDA_CHECK(cudaMemcpy(h_ptr_raw, in.ptr, in_elements * sizeof(Ti), cudaMemcpyDeviceToHost));
+            CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, in.ptr, in_elements * sizeof(Ti),
+                       cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+            CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
             Transform<Ti, To, op> transform;
             Binary<To, op> reduce;
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
index 27f1029302..87fa78c808 100644
--- a/src/backend/cuda/kernel/regions.hpp
+++ b/src/backend/cuda/kernel/regions.hpp
@@ -419,15 +419,18 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
 
     while (h_continue) {
         h_continue = 0;
-        CUDA_CHECK(cudaMemcpyToSymbol(continue_flag, &h_continue, sizeof(int),
-                                      0, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int),
+                    0, cudaMemcpyHostToDevice,
+                    cuda::getStream(cuda::getActiveDeviceId())));
 
         CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex);
 
         POST_LAUNCH_CHECK();
 
-        CUDA_CHECK(cudaMemcpyFromSymbol(&h_continue, continue_flag, sizeof(int),
-                                        0, cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int),
+                    0, cudaMemcpyDeviceToHost,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
     }
 
     // Now, perform the final relabeling.  This converts the equivalency
diff --git a/src/backend/cuda/kernel/shared.hpp b/src/backend/cuda/kernel/shared.hpp
index eb7b432a12..ab7f6d9764 100644
--- a/src/backend/cuda/kernel/shared.hpp
+++ b/src/backend/cuda/kernel/shared.hpp
@@ -44,7 +44,11 @@ SPECIALIZE(cdouble)
 SPECIALIZE(char)
 SPECIALIZE(int)
 SPECIALIZE(uint)
+SPECIALIZE(short)
+SPECIALIZE(ushort)
 SPECIALIZE(uchar)
+SPECIALIZE(intl)
+SPECIALIZE(uintl)
 
 #undef SPECIALIZE
 
diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp
index b834117e73..bcc8ac0566 100644
--- a/src/backend/cuda/kernel/sift_nonfree.hpp
+++ b/src/backend/cuda/kernel/sift_nonfree.hpp
@@ -142,6 +142,18 @@ static const dim_t SIFT_THREADS_Y = 8;
 // factor used to convert floating-podescriptor to unsigned char
 #define INT_DESCR_FCTR 512.f
 
+// Number of GLOH bins in radial direction
+static const unsigned GLOHRadialBins = 3;
+
+// Radii of GLOH descriptors
+__constant__ float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+static const unsigned GLOHHistBins = 16;
+
 template<typename T>
 void gaussian1D(T* out, const int dim, double sigma=0.0)
 {
@@ -179,7 +191,9 @@ Param<T> gauss_filter(float sigma)
 
     dim_t gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
     gauss_filter.ptr = memAlloc<T>(gauss_elem);
-    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T),
+                cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
     delete[] h_gauss;
 
@@ -230,7 +244,7 @@ __inline__ __device__ void normalizeDesc(
     int bsz_x = blockDim.x;
 
     for (int i = tid_x; i < histlen; i += bsz_x)
-        accum[tid_x] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+        accum[i] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
     __syncthreads();
 
     if (tid_x < 64)
@@ -264,6 +278,54 @@ __inline__ __device__ void normalizeDesc(
     __syncthreads();
 }
 
+__inline__ __device__ void normalizeGLOHDesc(
+    float* desc,
+    float* accum,
+    const int histlen)
+{
+    int tid_x = threadIdx.x;
+    int tid_y = threadIdx.y;
+    int bsz_x = blockDim.x;
+
+    for (int i = tid_x; i < histlen; i += bsz_x)
+        accum[i] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+    __syncthreads();
+
+    if (tid_x < 128)
+        accum[tid_x] += accum[tid_x+128];
+    __syncthreads();
+    if (tid_x < 64)
+        accum[tid_x] += accum[tid_x+64];
+    __syncthreads();
+    if (tid_x < 32)
+        accum[tid_x] += accum[tid_x+32];
+    __syncthreads();
+    if (tid_x < 16)
+        // GLOH is 272-dimensional, accumulating last 16 descriptors
+        accum[tid_x] += accum[tid_x+16] + accum[tid_x+256];
+    __syncthreads();
+    if (tid_x < 8)
+        accum[tid_x] += accum[tid_x+8];
+    __syncthreads();
+    if (tid_x < 4)
+        accum[tid_x] += accum[tid_x+4];
+    __syncthreads();
+    if (tid_x < 2)
+        accum[tid_x] += accum[tid_x+2];
+    __syncthreads();
+    if (tid_x < 1)
+        accum[tid_x] += accum[tid_x+1];
+    __syncthreads();
+
+    float len_sq = accum[0];
+    float len_inv = 1.0f / sqrtf(len_sq);
+
+    for (int i = tid_x; i < histlen; i += bsz_x) {
+        desc[tid_y*histlen+i] *= len_inv;
+    }
+    __syncthreads();
+}
+
 template<typename T>
 __global__ void sub(
     Param<T> out,
@@ -273,8 +335,10 @@ __global__ void sub(
 {
     unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    for (unsigned l = 0; l < n_layers; l++)
-        out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i];
+    if (i < nel) {
+        for (unsigned l = 0; l < n_layers; l++)
+            out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i];
+    }
 }
 
 #define SCPTR(Y, X) (s_center[(Y) * s_i + (X)])
@@ -759,10 +823,8 @@ __global__ void computeDescriptor(
     float* desc = shrdMem;
     float* accum = shrdMem + desc_len * histsz;
 
-    const int histlen = (d)*(d)*(n);
-
-    for (int i = tid_x; i < histlen*histsz; i += bsz_x)
-        desc[tid_y*histlen+i] = 0.f;
+    for (int i = tid_x; i < desc_len*histsz; i += bsz_x)
+        desc[tid_y*desc_len+i] = 0.f;
     __syncthreads();
 
     if (f < total_feat) {
@@ -859,17 +921,184 @@ __global__ void computeDescriptor(
         desc[l] += desc[l+desc_len];
     __syncthreads();
 
-    normalizeDesc(desc, accum, histlen);
+    normalizeDesc(desc, accum, desc_len);
 
-    for (int i = tid_x; i < d*d*n; i += bsz_x)
+    for (int i = tid_x; i < desc_len; i += bsz_x)
         desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
     __syncthreads();
 
-    normalizeDesc(desc, accum, histlen);
+    normalizeDesc(desc, accum, desc_len);
 
     if (f < total_feat) {
         // Calculate final descriptor values
-        for (int k = tid_x; k < d*d*n; k += bsz_x)
+        for (int k = tid_x; k < desc_len; k += bsz_x)
+            desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
+    }
+}
+
+// Computes GLOH feature descriptors for features in an array. Based on Section III-B
+// of Mikolajczyk and Schmid paper.
+template<typename T>
+__global__ void computeGLOHDescriptor(
+    float* desc_out,
+    const unsigned desc_len,
+    const unsigned histsz,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const float* ori_in,
+    const unsigned total_feat,
+    const CParam<T> gauss_octave,
+    const int d,
+    const unsigned rb,
+    const unsigned ab,
+    const unsigned hb,
+    const float scale,
+    const int n_layers)
+{
+    const int tid_x = threadIdx.x;
+    const int tid_y = threadIdx.y;
+    const int bsz_x = blockDim.x;
+    const int bsz_y = blockDim.y;
+
+    const int f = blockIdx.y * bsz_y + tid_y;
+
+    SharedMemory<float> shared;
+    float* shrdMem = shared.getPointer();
+    float* desc = shrdMem;
+    float* accum = shrdMem + desc_len * histsz;
+
+    for (int i = tid_x; i < desc_len*histsz; i += bsz_x)
+        desc[tid_y*desc_len+i] = 0.f;
+    __syncthreads();
+
+    if (f < total_feat) {
+        const unsigned layer = layer_in[f];
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+        const float size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        const int dim0 = gauss_octave.dims[0];
+        const int dim1 = gauss_octave.dims[1];
+        const int imel = dim0 * dim1;
+
+        // Points img to correct Gaussian pyramid layer
+        const T* img_ptr = gauss_octave.ptr + layer * imel;
+
+        float cos_t = cosf(ori);
+        float sin_t = sinf(ori);
+        float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+        float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+        float exp_denom = GLOHRadii[rb-1] * 0.5f;
+
+        float hist_width = DESCR_SCL_FCTR * size * scale * 0.5f;
+
+        // Keep same descriptor radius used for SIFT
+        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+        // Alternative radius size calculation, changing the radius weight
+        // (rw) in the range of 0.25f-0.75f gives different results,
+        // increasing it tends to show a better recall rate but with a
+        // smaller amount of correct matches
+        //float rw = 0.5f;
+        //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
+
+        int len = radius*2+1;
+        const int hist_off = (tid_x % histsz) * desc_len;
+
+        // Calculate orientation histogram
+        for (int l = tid_x; l < len*len; l += bsz_x) {
+            int i = l / len - radius;
+            int j = l % len - radius;
+
+            int y = fy + i;
+            int x = fx + j;
+
+            float x_rot = (j * cos_t - i * sin_t);
+            float y_rot = (j * sin_t + i * cos_t);
+
+            float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+            float theta = atan2(y_rot, x_rot);
+            while (theta < 0.0f)
+                theta += PI_VAL*2;
+            while (theta >= PI_VAL*2)
+                theta -= PI_VAL*2;
+
+            float tbin = theta * polar_bins_per_rad;
+            float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+                         ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+                         min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
+
+            if (r <= GLOHRadii[rb-1] &&
+                y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
+                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+                float grad_mag = sqrtf(dx*dx + dy*dy);
+                float grad_ori = atan2f(dy, dx) - ori;
+                while (grad_ori < 0.0f)
+                    grad_ori += PI_VAL*2;
+                while (grad_ori >= PI_VAL*2)
+                    grad_ori -= PI_VAL*2;
+
+                float w = exp(-r / exp_denom);
+                float obin = grad_ori * hist_bins_per_rad;
+                float mag = grad_mag*w;
+
+                int t0 = floor(tbin);
+                int r0 = floor(rbin);
+                int o0 = floor(obin);
+                tbin -= t0;
+                rbin -= r0;
+                obin -= o0;
+
+                for (int rl = 0; rl <= 1; rl++) {
+                    int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+                    float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+                    if (rb >= 0 && rb <= 2) {
+                        for (int tl = 0; tl <= 1; tl++) {
+                            int tb = (t0 + tl) % ab;
+                            float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+                            for (int ol = 0; ol <= 1; ol++) {
+                                int ob = (o0 + ol) % hb;
+                                float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+                                unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+                                atomicAdd(&desc[hist_off + tid_y*desc_len + idx], v_o);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    __syncthreads();
+
+    // Combine histograms (reduces previous atomicAdd overhead)
+    for (int l = tid_x; l < desc_len*4; l += bsz_x)
+        desc[l] += desc[l+4*desc_len];
+    __syncthreads();
+    for (int l = tid_x; l < desc_len*2; l += bsz_x)
+        desc[l    ] += desc[l+2*desc_len];
+    __syncthreads();
+    for (int l = tid_x; l < desc_len; l += bsz_x)
+        desc[l] += desc[l+desc_len];
+    __syncthreads();
+
+    normalizeGLOHDesc(desc, accum, desc_len);
+
+    for (int i = tid_x; i < desc_len; i += bsz_x)
+        desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
+    __syncthreads();
+
+    normalizeGLOHDesc(desc, accum, desc_len);
+
+    if (f < total_feat) {
+        // Calculate final descriptor values
+        for (int k = tid_x; k < desc_len; k += bsz_x)
             desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
     }
 }
@@ -1010,8 +1239,9 @@ std::vector< Param<T> > buildGaussPyr(
             const unsigned imel = tmp_pyr[idx].dims[3] * tmp_pyr[idx].strides[3];
             const unsigned offset = imel * l;
 
-            //getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
-            CUDA_CHECK(cudaMemcpy(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr, imel * sizeof(T), cudaMemcpyDeviceToDevice));
+            CUDA_CHECK(cudaMemcpyAsync(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr,
+                        imel * sizeof(T), cudaMemcpyDeviceToDevice,
+                        cuda::getStream(cuda::getActiveDeviceId())));
         }
     }
 
@@ -1093,7 +1323,8 @@ void sift(unsigned* out_feat,
           const float init_sigma,
           const bool double_input,
           const float img_scale,
-          const float feature_ratio)
+          const float feature_ratio,
+          const bool compute_GLOH)
 {
     const unsigned min_dim = (double_input) ? min(img.dims[0]*2, img.dims[1]*2)
                                             : min(img.dims[0], img.dims[1]);
@@ -1116,7 +1347,10 @@ void sift(unsigned* out_feat,
 
     const unsigned d = DESCR_WIDTH;
     const unsigned n = DESCR_HIST_BINS;
-    const unsigned desc_len = d*d*n;
+    const unsigned rb = GLOHRadialBins;
+    const unsigned ab = GLOHAngularBins;
+    const unsigned hb = GLOHHistBins;
+    const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
 
     unsigned* d_count = memAlloc<unsigned>(1);
     for (unsigned i = 0; i < n_octaves; i++) {
@@ -1148,7 +1382,9 @@ void sift(unsigned* out_feat,
         POST_LAUNCH_CHECK();
 
         unsigned extrema_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
         extrema_feat = min(extrema_feat, max_feat);
 
         if (extrema_feat == 0) {
@@ -1185,7 +1421,9 @@ void sift(unsigned* out_feat,
         memFree(d_extrema_y);
         memFree(d_extrema_layer);
 
-        CUDA_CHECK(cudaMemcpy(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
         interp_feat = min(interp_feat, max_feat);
 
         CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
@@ -1245,7 +1483,9 @@ void sift(unsigned* out_feat,
         memFree(d_interp_size);
 
         unsigned nodup_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
         CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
                                    cuda::getStream(cuda::getActiveDeviceId())));
 
@@ -1277,7 +1517,9 @@ void sift(unsigned* out_feat,
         memFree(d_nodup_size);
 
         unsigned oriented_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+                    cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
         oriented_feat = min(oriented_feat, max_oriented_feat);
 
         if (oriented_feat == 0) {
@@ -1302,11 +1544,19 @@ void sift(unsigned* out_feat,
         const unsigned histsz = 8;
         const size_t shared_size = desc_len * (histsz+1) * sizeof(float);
 
-        CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
-                         d_desc, desc_len, histsz,
-                         d_oriented_x, d_oriented_y, d_oriented_layer,
-                         d_oriented_response, d_oriented_size, d_oriented_ori,
-                         oriented_feat, gauss_pyr[i], d, n, scale, n_layers);
+        if (compute_GLOH)
+            CUDA_LAUNCH_SMEM((computeGLOHDescriptor<T>), blocks, threads, shared_size,
+                             d_desc, desc_len, histsz,
+                             d_oriented_x, d_oriented_y, d_oriented_layer,
+                             d_oriented_response, d_oriented_size, d_oriented_ori,
+                             oriented_feat, gauss_pyr[i], d, rb, ab, hb,
+                             scale, n_layers);
+        else
+            CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
+                             d_desc, desc_len, histsz,
+                             d_oriented_x, d_oriented_y, d_oriented_layer,
+                             d_oriented_response, d_oriented_size, d_oriented_ori,
+                             oriented_feat, gauss_pyr[i], d, n, scale, n_layers);
         POST_LAUNCH_CHECK();
 
         total_feat += oriented_feat;
@@ -1342,14 +1592,20 @@ void sift(unsigned* out_feat,
         if (feat_pyr[i] == 0)
             continue;
 
-        CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
-        CUDA_CHECK(cudaMemcpy(*d_desc+(offset*desc_len), d_desc_pyr[i],
-                             feat_pyr[i] * desc_len * sizeof(float), cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+
+        CUDA_CHECK(cudaMemcpyAsync(*d_desc+(offset*desc_len), d_desc_pyr[i],
+                    feat_pyr[i] * desc_len * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
         memFree(d_x_pyr[i]);
         memFree(d_y_pyr[i]);
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index fdbd88a2f0..30b40baf89 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -171,7 +171,9 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
 
     POST_LAUNCH_CHECK();
 
-    CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(count, d_corners_found, sizeof(unsigned),
+                cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
     memFree(d_corners_found);
 }
 
diff --git a/src/backend/cuda/kernel/triangle.hpp b/src/backend/cuda/kernel/triangle.hpp
index 374f6b18be..8d335d6113 100644
--- a/src/backend/cuda/kernel/triangle.hpp
+++ b/src/backend/cuda/kernel/triangle.hpp
@@ -43,6 +43,9 @@ namespace cuda
             T *d_r = r.ptr;
             const T *d_i = in.ptr;
 
+            const T one  = scalar<T>(1);
+            const T zero = scalar<T>(0);
+
             if(oz < r.dims[2] && ow < r.dims[3]) {
                 d_i = d_i + oz * in.strides[2]    + ow * in.strides[3];
                 d_r = d_r + oz * r.strides[2] + ow * r.strides[3];
@@ -56,9 +59,10 @@ namespace cuda
                         bool cond = is_upper ? (oy >= ox) : (oy <= ox);
                         bool do_unit_diag  = is_unit_diag && (ox == oy);
                         if(cond) {
-                            Yd_r[ox] = do_unit_diag ? scalar<T>(1) : Yd_i[ox];
+                            // Change made because of compute 53 failing tests
+                            Yd_r[ox] = do_unit_diag ? one : Yd_i[ox];
                         } else {
-                            Yd_r[ox] = scalar<T>(0);
+                            Yd_r[ox] = zero;
                         }
                     }
                 }
diff --git a/src/backend/cuda/kernel/where.hpp b/src/backend/cuda/kernel/where.hpp
index fb2fd1ddca..746e2b82ac 100644
--- a/src/backend/cuda/kernel/where.hpp
+++ b/src/backend/cuda/kernel/where.hpp
@@ -117,8 +117,10 @@ namespace kernel
 
         // Get output size and allocate output
         uint total;
-        CUDA_CHECK(cudaMemcpy(&total, rtmp.ptr + rtmp_elements - 1,
-                              sizeof(uint), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(&total, rtmp.ptr + rtmp_elements - 1,
+                              sizeof(uint), cudaMemcpyDeviceToHost,
+                              cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
 
         out.ptr = memAlloc<uint>(total);
 
diff --git a/src/backend/cuda/lookup.cu b/src/backend/cuda/lookup.cu
index 8f910dea6a..70c9ed90b7 100644
--- a/src/backend/cuda/lookup.cu
+++ b/src/backend/cuda/lookup.cu
@@ -42,6 +42,10 @@ Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const
     template Array<T> lookup<T, double  >(const Array<T> &input, const Array<double  > &indices, const unsigned dim); \
     template Array<T> lookup<T, int     >(const Array<T> &input, const Array<int     > &indices, const unsigned dim); \
     template Array<T> lookup<T, unsigned>(const Array<T> &input, const Array<unsigned> &indices, const unsigned dim); \
+    template Array<T> lookup<T, short   >(const Array<T> &input, const Array<short   > &indices, const unsigned dim); \
+    template Array<T> lookup<T, ushort  >(const Array<T> &input, const Array<ushort  > &indices, const unsigned dim); \
+    template Array<T> lookup<T, intl    >(const Array<T> &input, const Array<intl    > &indices, const unsigned dim); \
+    template Array<T> lookup<T, uintl   >(const Array<T> &input, const Array<uintl   > &indices, const unsigned dim); \
     template Array<T> lookup<T, uchar   >(const Array<T> &input, const Array<uchar   > &indices, const unsigned dim);
 
 INSTANTIATE(float   );
@@ -54,5 +58,7 @@ INSTANTIATE(intl    );
 INSTANTIATE(uintl   );
 INSTANTIATE(uchar   );
 INSTANTIATE(char    );
+INSTANTIATE(short   );
+INSTANTIATE(ushort  );
 
 }
diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu
index 85dedf50e0..2a45d4b9f5 100644
--- a/src/backend/cuda/lu.cu
+++ b/src/backend/cuda/lu.cu
@@ -166,6 +166,36 @@ INSTANTIATE_LU(double)
 INSTANTIATE_LU(cdouble)
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+////////////////////////////////////////////////////////////////////////////////
+// For versions earlier than CUDA 7, use CPU fallback
+////////////////////////////////////////////////////////////////////////////////
+#include <cpu_lapack/cpu_lu.hpp>
+
+namespace cuda
+{
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+    return cpu::lu(lower, upper, pivot, in);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+    return cpu::lu_inplace(in, convert_pivot);
+}
+
+#define INSTANTIATE_LU(T)                                                                           \
+    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
+    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+}
+
 #else
 namespace cuda
 {
diff --git a/src/backend/cuda/match_template.cu b/src/backend/cuda/match_template.cu
index 5b30eb03e8..0ce0ce20e2 100644
--- a/src/backend/cuda/match_template.cu
+++ b/src/backend/cuda/match_template.cu
@@ -54,5 +54,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/cuda/math.hpp b/src/backend/cuda/math.hpp
index 577db84628..ad7563f672 100644
--- a/src/backend/cuda/math.hpp
+++ b/src/backend/cuda/math.hpp
@@ -23,6 +23,8 @@
 namespace cuda
 {
     template<typename T> static inline __DH__ T abs(T val)  { return abs(val); }
+    static inline __DH__ int  abs(int  val) { return (val>0? val : -val); }
+    static inline __DH__ char  abs(char  val) { return (val>0? val : -val); }
     static inline __DH__ float  abs(float  val) { return fabsf(val); }
     static inline __DH__ double abs(double val) { return fabs (val); }
     static inline __DH__ float  abs(cfloat  cval) { return cuCabsf(cval); }
@@ -108,6 +110,9 @@ namespace cuda
     template<> __device__  float  limit_min<float>()  { return -CUDART_INF_F; }
     template<> __device__  double limit_max<double>() { return  CUDART_INF; }
     template<> __device__  double limit_min<double>() { return -CUDART_INF; }
+    template<> __device__  short  limit_max<short>()  { return 0x7fff; }
+    template<> __device__  short  limit_min<short>()  { return 0x8000; }
+    template<> __device__  ushort limit_max<ushort>() { return ((ushort)1) << (8 * sizeof(ushort) - 1); }
 #endif
 
 #define upcast cuComplexFloatToDouble
@@ -134,6 +139,8 @@ __SDH__ cdouble conj(cdouble c) { return cuConj(c); }
 __SDH__ cfloat make_cfloat(bool     x) { return make_cuComplex(x,0);     }
 __SDH__ cfloat make_cfloat(int      x) { return make_cuComplex(x,0);     }
 __SDH__ cfloat make_cfloat(unsigned x) { return make_cuComplex(x,0);     }
+__SDH__ cfloat make_cfloat(short    x) { return make_cuComplex(x,0);     }
+__SDH__ cfloat make_cfloat(ushort   x) { return make_cuComplex(x,0);     }
 __SDH__ cfloat make_cfloat(float    x) { return make_cuComplex(x,0);     }
 __SDH__ cfloat make_cfloat(double   x) { return make_cuComplex(x,0);     }
 __SDH__ cfloat make_cfloat(cfloat   x) { return x;                    }
@@ -142,6 +149,8 @@ __SDH__ cfloat make_cfloat(cdouble  c) { return make_cuComplex(c.x,c.y); }
 __SDH__ cdouble make_cdouble(bool      x) { return make_cuDoubleComplex(x,0);       }
 __SDH__ cdouble make_cdouble(int       x) { return make_cuDoubleComplex(x,0);       }
 __SDH__ cdouble make_cdouble(unsigned  x) { return make_cuDoubleComplex(x,0);       }
+__SDH__ cdouble make_cdouble(short     x) { return make_cuDoubleComplex(x,0);       }
+__SDH__ cdouble make_cdouble(ushort    x) { return make_cuDoubleComplex(x,0);       }
 __SDH__ cdouble make_cdouble(float     x) { return make_cuDoubleComplex(x,0);       }
 __SDH__ cdouble make_cdouble(double    x) { return make_cuDoubleComplex(x,0);       }
 __SDH__ cdouble make_cdouble(cdouble   x) { return x;                       }
diff --git a/src/backend/cuda/max.cu b/src/backend/cuda/max.cu
index 78414224c5..c910beaad6 100644
--- a/src/backend/cuda/max.cu
+++ b/src/backend/cuda/max.cu
@@ -22,4 +22,6 @@ namespace cuda
     INSTANTIATE(af_max_t, uintl  , uintl  )
     INSTANTIATE(af_max_t, char   , char   )
     INSTANTIATE(af_max_t, uchar  , uchar  )
+    INSTANTIATE(af_max_t, short  , short  )
+    INSTANTIATE(af_max_t, ushort , ushort )
 }
diff --git a/src/backend/cuda/meanshift.cu b/src/backend/cuda/meanshift.cu
index 0fa1ac3ca3..2e6dcfcc57 100644
--- a/src/backend/cuda/meanshift.cu
+++ b/src/backend/cuda/meanshift.cu
@@ -42,5 +42,9 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
+INSTANTIATE(intl  )
+INSTANTIATE(uintl )
 
 }
diff --git a/src/backend/cuda/medfilt.cu b/src/backend/cuda/medfilt.cu
index 9a99caea01..c87aea4dbe 100644
--- a/src/backend/cuda/medfilt.cu
+++ b/src/backend/cuda/medfilt.cu
@@ -44,5 +44,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp
index 45e410fbba..9b3d731b4b 100644
--- a/src/backend/cuda/memory.cpp
+++ b/src/backend/cuda/memory.cpp
@@ -384,5 +384,7 @@ namespace cuda
     INSTANTIATE(uchar)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/min.cu b/src/backend/cuda/min.cu
index 0251414309..26719de468 100644
--- a/src/backend/cuda/min.cu
+++ b/src/backend/cuda/min.cu
@@ -22,4 +22,6 @@ namespace cuda
     INSTANTIATE(af_min_t, uintl  , uintl  )
     INSTANTIATE(af_min_t, char   , char   )
     INSTANTIATE(af_min_t, uchar  , uchar  )
+    INSTANTIATE(af_min_t, short  , short  )
+    INSTANTIATE(af_min_t, ushort , ushort )
 }
diff --git a/src/backend/cuda/nearest_neighbour.cu b/src/backend/cuda/nearest_neighbour.cu
index 1899c9d378..789c0f5b12 100644
--- a/src/backend/cuda/nearest_neighbour.cu
+++ b/src/backend/cuda/nearest_neighbour.cu
@@ -73,6 +73,8 @@ INSTANTIATE(uint  , uint)
 INSTANTIATE(intl  , intl)
 INSTANTIATE(uintl , uintl)
 INSTANTIATE(uchar , uint)
+INSTANTIATE(short , int)
+INSTANTIATE(ushort, uint)
 
 INSTANTIATE(uintl, uint)    // For Hamming
 
diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
index 45d32bac7f..c154a7eda1 100644
--- a/src/backend/cuda/platform.cpp
+++ b/src/backend/cuda/platform.cpp
@@ -141,6 +141,11 @@ static inline string toString(T val)
 ///////////////////////////////////////////////////////////////////////////
 // Wrapper Functions
 ///////////////////////////////////////////////////////////////////////////
+int getBackend()
+{
+    return AF_BACKEND_CUDA;
+}
+
 string getInfo()
 {
     ostringstream info;
@@ -269,6 +274,18 @@ int getDeviceNativeId(int device)
     return -1;
 }
 
+int getDeviceIdFromNativeId(int nativeId)
+{
+    DeviceManager& mngr = DeviceManager::getInstance();
+
+    int devId = 0;
+    for(devId = 0; devId < mngr.nDevices; ++devId) {
+        if (nativeId == mngr.cuDevices[devId].nativeId)
+            break;
+    }
+    return devId;
+}
+
 cudaStream_t getStream(int device)
 {
     return DeviceManager::getInstance().streams[device];
@@ -388,3 +405,9 @@ af_err afcu_get_native_id(int* nativeid, int id)
     *nativeid = cuda::getDeviceNativeId(id);
     return AF_SUCCESS;
 }
+
+af_err afcu_set_native_id(int nativeid)
+{
+    cuda::setDevice(cuda::getDeviceIdFromNativeId(nativeid));
+    return AF_SUCCESS;
+}
diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp
index a893b0128e..7b649686dc 100644
--- a/src/backend/cuda/platform.hpp
+++ b/src/backend/cuda/platform.hpp
@@ -20,6 +20,8 @@
 namespace cuda
 {
 
+int getBackend();
+
 std::string getInfo();
 
 std::string getDeviceInfo(int device);
@@ -79,6 +81,8 @@ class DeviceManager
 
         friend int getDeviceNativeId(int device);
 
+        friend int getDeviceIdFromNativeId(int nativeId);
+
         friend cudaStream_t getStream(int device);
 
         friend int setDevice(int device);
diff --git a/src/backend/cuda/plot.cu b/src/backend/cuda/plot.cu
index 40a004eae8..20f899323d 100644
--- a/src/backend/cuda/plot.cu
+++ b/src/backend/cuda/plot.cu
@@ -52,6 +52,8 @@ INSTANTIATE(float)
 INSTANTIATE(double)
 INSTANTIATE(int)
 INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 INSTANTIATE(uchar)
 
 }
diff --git a/src/backend/cuda/plot3.cu b/src/backend/cuda/plot3.cu
new file mode 100644
index 0000000000..378a6ec27f
--- /dev/null
+++ b/src/backend/cuda/plot3.cu
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <interopManager.hpp>
+#include <Array.hpp>
+#include <plot3.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <join.hpp>
+#include <reduce.hpp>
+#include <reorder.hpp>
+
+using af::dim4;
+
+namespace cuda
+{
+
+template<typename T>
+void copy_plot3(const Array<T> &P, fg::Plot3* plot3)
+{
+    const T *d_P = P.get();
+
+    InteropManager& intrpMngr = InteropManager::getInstance();
+
+    cudaGraphicsResource *cudaVBOResource = intrpMngr.getBufferResource(plot3);
+    // Map resource. Copy data to VBO. Unmap resource.
+    size_t num_bytes = plot3->size();
+    T* d_vbo = NULL;
+    cudaGraphicsMapResources(1, &cudaVBOResource, 0);
+    cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
+    cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+               cuda::getStream(cuda::getActiveDeviceId()));
+    cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
+
+    CheckGL("After cuda resource copy");
+
+    POST_LAUNCH_CHECK();
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_plot3<T>(const Array<T> &P, fg::Plot3* plot3);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(uchar)
+
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/cuda/plot3.hpp b/src/backend/cuda/plot3.hpp
new file mode 100644
index 0000000000..3badb331f3
--- /dev/null
+++ b/src/backend/cuda/plot3.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace cuda
+{
+    template<typename T>
+    void copy_plot3(const Array<T> &P, fg::Plot3* plot3);
+}
+
+#endif
+
diff --git a/src/backend/cuda/product.cu b/src/backend/cuda/product.cu
index abc5c1f37d..d00e140f49 100644
--- a/src/backend/cuda/product.cu
+++ b/src/backend/cuda/product.cu
@@ -11,7 +11,7 @@
 
 namespace cuda
 {
-    //sum
+    //mul
     INSTANTIATE(af_mul_t, float  , float  )
     INSTANTIATE(af_mul_t, double , double )
     INSTANTIATE(af_mul_t, cfloat , cfloat )
@@ -22,4 +22,6 @@ namespace cuda
     INSTANTIATE(af_mul_t, uintl  , uintl  )
     INSTANTIATE(af_mul_t, char   , int    )
     INSTANTIATE(af_mul_t, uchar  , uint   )
+    INSTANTIATE(af_mul_t, short  , int    )
+    INSTANTIATE(af_mul_t, ushort , uint   )
 }
diff --git a/src/backend/cuda/qr.cu b/src/backend/cuda/qr.cu
index 4654ee6e89..41ad1c2600 100644
--- a/src/backend/cuda/qr.cu
+++ b/src/backend/cuda/qr.cu
@@ -219,6 +219,35 @@ INSTANTIATE_QR(double)
 INSTANTIATE_QR(cdouble)
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+#include <cpu_lapack/cpu_qr.hpp>
+
+namespace cuda
+{
+
+template<typename T>
+void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
+{
+    return cpu::qr(q, r, t, in);
+}
+
+template<typename T>
+Array<T> qr_inplace(Array<T> &in)
+{
+    return cpu::qr_inplace(in);
+}
+
+#define INSTANTIATE_QR(T)                                                                           \
+    template Array<T> qr_inplace<T>(Array<T> &in);                                                \
+    template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
+
+INSTANTIATE_QR(float)
+INSTANTIATE_QR(cfloat)
+INSTANTIATE_QR(double)
+INSTANTIATE_QR(cdouble)
+
+}
+
 #else
 namespace cuda
 {
diff --git a/src/backend/cuda/random.cu b/src/backend/cuda/random.cu
index c9e6197f14..07cbdc4d9d 100644
--- a/src/backend/cuda/random.cu
+++ b/src/backend/cuda/random.cu
@@ -44,6 +44,8 @@ namespace cuda
     template Array<uintl>   randu<uintl>   (const af::dim4 &dims);
     template Array<char>    randu<char>    (const af::dim4 &dims);
     template Array<uchar>   randu<uchar>   (const af::dim4 &dims);
+    template Array<short>   randu<short>   (const af::dim4 &dims);
+    template Array<ushort>  randu<ushort>  (const af::dim4 &dims);
 
     template Array<float>   randn<float>   (const af::dim4 &dims);
     template Array<double>  randn<double>  (const af::dim4 &dims);
diff --git a/src/backend/cuda/range.cu b/src/backend/cuda/range.cu
index 9a1a7cd3f0..ace3b1c49d 100644
--- a/src/backend/cuda/range.cu
+++ b/src/backend/cuda/range.cu
@@ -45,4 +45,6 @@ namespace cuda
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/regions.cu b/src/backend/cuda/regions.cu
index 656048c9e9..6b50b71477 100644
--- a/src/backend/cuda/regions.cu
+++ b/src/backend/cuda/regions.cu
@@ -65,5 +65,7 @@ INSTANTIATE(float )
 INSTANTIATE(double)
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/reorder.cu b/src/backend/cuda/reorder.cu
index 2c920e632a..7292fcd6a0 100644
--- a/src/backend/cuda/reorder.cu
+++ b/src/backend/cuda/reorder.cu
@@ -43,5 +43,7 @@ namespace cuda
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/resize.cu b/src/backend/cuda/resize.cu
index dcec9720ad..02d34999e8 100644
--- a/src/backend/cuda/resize.cu
+++ b/src/backend/cuda/resize.cu
@@ -57,4 +57,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/rotate.cu b/src/backend/cuda/rotate.cu
index 24e41d75b3..23c99e13f2 100644
--- a/src/backend/cuda/rotate.cu
+++ b/src/backend/cuda/rotate.cu
@@ -53,4 +53,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/scan.cu b/src/backend/cuda/scan.cu
index a76abc2338..15ee6b4c93 100644
--- a/src/backend/cuda/scan.cu
+++ b/src/backend/cuda/scan.cu
@@ -51,5 +51,7 @@ namespace cuda
     INSTANTIATE(af_add_t, uintl  , uintl  )
     INSTANTIATE(af_add_t, char   , int    )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, ushort , uint   )
     INSTANTIATE(af_notzero_t, char  , uint   )
 }
diff --git a/src/backend/cuda/select.cu b/src/backend/cuda/select.cu
index eb90730354..9697da4821 100644
--- a/src/backend/cuda/select.cu
+++ b/src/backend/cuda/select.cu
@@ -48,4 +48,6 @@ namespace cuda
     INSTANTIATE(uintl  )
     INSTANTIATE(char   )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 }
diff --git a/src/backend/cuda/set.cu b/src/backend/cuda/set.cu
index 5b457e1ae0..63501d3f2a 100644
--- a/src/backend/cuda/set.cu
+++ b/src/backend/cuda/set.cu
@@ -117,4 +117,8 @@ namespace cuda
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
diff --git a/src/backend/cuda/shift.cu b/src/backend/cuda/shift.cu
index f97eb4aff8..89e78ac145 100644
--- a/src/backend/cuda/shift.cu
+++ b/src/backend/cuda/shift.cu
@@ -41,4 +41,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/sift.cu b/src/backend/cuda/sift.cu
index 3f1e99bd25..f3d36d7dfb 100644
--- a/src/backend/cuda/sift.cu
+++ b/src/backend/cuda/sift.cu
@@ -31,7 +31,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio)
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH)
 {
 #ifdef AF_BUILD_SIFT
     const dim4 dims = in.dims();
@@ -48,7 +49,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
     kernel::sift<T, convAccT>(&nfeat_out, &desc_len, &x_out, &y_out, &score_out,
                               &orientation_out, &size_out, &desc_out,
                               in, n_layers, contrast_thr, edge_thr,
-                              init_sigma, double_input, img_scale, feature_ratio);
+                              init_sigma, double_input, img_scale, feature_ratio,
+                              compute_GLOH);
 
     if (nfeat_out > 0) {
         if (x_out == NULL || y_out == NULL || score_out == NULL ||
@@ -70,7 +72,10 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
 
     return nfeat_out;
 #else
-    AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+    if (compute_GLOH)
+        AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AF_ERR_NONFREE);
+    else
+        AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AF_ERR_NONFREE);
 #endif
 }
 
@@ -81,7 +86,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
                                         const Array<T>& in, const unsigned n_layers,        \
                                         const float contrast_thr, const float edge_thr,     \
                                         const float init_sigma, const bool double_input,    \
-                                        const float img_scale, const float feature_ratio);
+                                        const float img_scale, const float feature_ratio,   \
+                                        const bool compute_GLOH);
 
 INSTANTIATE(float , float )
 INSTANTIATE(double, double)
diff --git a/src/backend/cuda/sift.hpp b/src/backend/cuda/sift.hpp
index c3eda20d78..28b887929a 100644
--- a/src/backend/cuda/sift.hpp
+++ b/src/backend/cuda/sift.hpp
@@ -21,6 +21,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio);
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH);
 
 }
diff --git a/src/backend/cuda/sobel.cu b/src/backend/cuda/sobel.cu
index 6f9b1948c6..ab5a69370d 100644
--- a/src/backend/cuda/sobel.cu
+++ b/src/backend/cuda/sobel.cu
@@ -42,5 +42,7 @@ INSTANTIATE(int   , int)
 INSTANTIATE(uint  , int)
 INSTANTIATE(char  , int)
 INSTANTIATE(uchar , int)
+INSTANTIATE(short , int)
+INSTANTIATE(ushort, int)
 
 }
diff --git a/src/backend/cuda/solve.cu b/src/backend/cuda/solve.cu
index 7077c1fbc3..8008ba13f5 100644
--- a/src/backend/cuda/solve.cu
+++ b/src/backend/cuda/solve.cu
@@ -384,6 +384,37 @@ INSTANTIATE_SOLVE(cdouble)
 
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+#include<cpu_lapack/cpu_solve.hpp>
+
+namespace cuda
+{
+
+template<typename T>
+Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
+                 const Array<T> &b, const af_mat_prop options)
+{
+    return cpu::solveLU(A, pivot, b, options);
+}
+
+template<typename T>
+Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
+{
+    return cpu::solve(a, b, options);
+}
+
+#define INSTANTIATE_SOLVE(T)                                            \
+    template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
+                               const af_mat_prop options);              \
+    template Array<T> solveLU<T>(const Array<T> &A, const Array<int> &pivot, \
+                                 const Array<T> &b, const af_mat_prop options); \
+
+INSTANTIATE_SOLVE(float)
+INSTANTIATE_SOLVE(cfloat)
+INSTANTIATE_SOLVE(double)
+INSTANTIATE_SOLVE(cdouble)
+}
+
 #else
 namespace cuda
 {
diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu
index dc74b800a4..6d14c0309f 100644
--- a/src/backend/cuda/sort.cu
+++ b/src/backend/cuda/sort.cu
@@ -40,4 +40,8 @@ namespace cuda
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
diff --git a/src/backend/cuda/sort_by_key/ascd_s16.cu b/src/backend/cuda/sort_by_key/ascd_s16.cu
new file mode 100644
index 0000000000..d51e9ae671
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/ascd_s16.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(short, true)
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_s64.cu b/src/backend/cuda/sort_by_key/ascd_s64.cu
new file mode 100644
index 0000000000..25a1e589f8
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/ascd_s64.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(intl, true)
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u16.cu b/src/backend/cuda/sort_by_key/ascd_u16.cu
new file mode 100644
index 0000000000..e06036abc7
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/ascd_u16.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(ushort, true)
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u64.cu b/src/backend/cuda/sort_by_key/ascd_u64.cu
new file mode 100644
index 0000000000..63eec5fdd4
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/ascd_u64.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(uintl, true)
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s16.cu b/src/backend/cuda/sort_by_key/desc_s16.cu
new file mode 100644
index 0000000000..63967b6117
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/desc_s16.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(short, false)
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s64.cu b/src/backend/cuda/sort_by_key/desc_s64.cu
new file mode 100644
index 0000000000..a10ee11475
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/desc_s64.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(intl, false)
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u16.cu b/src/backend/cuda/sort_by_key/desc_u16.cu
new file mode 100644
index 0000000000..69dc01634b
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/desc_u16.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(ushort, false)
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u64.cu b/src/backend/cuda/sort_by_key/desc_u64.cu
new file mode 100644
index 0000000000..43f60c075b
--- /dev/null
+++ b/src/backend/cuda/sort_by_key/desc_u64.cu
@@ -0,0 +1,15 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <sort_by_key_impl.hpp>
+
+namespace cuda
+{
+    INSTANTIATE1(uintl, false)
+}
diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp
index 32758b47a5..d01ace404e 100644
--- a/src/backend/cuda/sort_by_key_impl.hpp
+++ b/src/backend/cuda/sort_by_key_impl.hpp
@@ -40,6 +40,10 @@ namespace cuda
     INSTANTIATE(Tk, double, dr) \
     INSTANTIATE(Tk, int,    dr) \
     INSTANTIATE(Tk, uint,   dr) \
+    INSTANTIATE(Tk, short,  dr) \
+    INSTANTIATE(Tk, ushort, dr) \
     INSTANTIATE(Tk, char,   dr) \
-    INSTANTIATE(Tk, uchar,  dr)
+    INSTANTIATE(Tk, uchar,  dr) \
+    INSTANTIATE(Tk, intl,   dr) \
+    INSTANTIATE(Tk, uintl,  dr)
 }
diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu
index b80287b90f..606aab4eb1 100644
--- a/src/backend/cuda/sort_index.cu
+++ b/src/backend/cuda/sort_index.cu
@@ -41,5 +41,9 @@ namespace cuda
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 
 }
diff --git a/src/backend/cuda/sum.cu b/src/backend/cuda/sum.cu
index 407cc98f45..863cf9a7da 100644
--- a/src/backend/cuda/sum.cu
+++ b/src/backend/cuda/sum.cu
@@ -17,9 +17,19 @@ namespace cuda
     INSTANTIATE(af_add_t, cfloat , cfloat )
     INSTANTIATE(af_add_t, cdouble, cdouble)
     INSTANTIATE(af_add_t, int    , int    )
+    INSTANTIATE(af_add_t, int    , float  )
     INSTANTIATE(af_add_t, uint   , uint   )
+    INSTANTIATE(af_add_t, uint   , float  )
     INSTANTIATE(af_add_t, intl   , intl   )
+    INSTANTIATE(af_add_t, intl   , double )
     INSTANTIATE(af_add_t, uintl  , uintl  )
+    INSTANTIATE(af_add_t, uintl  , double )
     INSTANTIATE(af_add_t, char   , int    )
+    INSTANTIATE(af_add_t, char   , float  )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, uchar  , float  )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, short  , float  )
+    INSTANTIATE(af_add_t, ushort , uint   )
+    INSTANTIATE(af_add_t, ushort , float  )
 }
diff --git a/src/backend/cuda/surface.cu b/src/backend/cuda/surface.cu
new file mode 100644
index 0000000000..fcb9f81975
--- /dev/null
+++ b/src/backend/cuda/surface.cu
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <interopManager.hpp>
+#include <Array.hpp>
+#include <surface.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <join.hpp>
+#include <reduce.hpp>
+#include <reorder.hpp>
+
+using af::dim4;
+
+namespace cuda
+{
+
+template<typename T>
+void copy_surface(const Array<T> &P, fg::Surface* surface)
+{
+    const T *d_P = P.get();
+
+    InteropManager& intrpMngr = InteropManager::getInstance();
+
+    cudaGraphicsResource *cudaVBOResource = intrpMngr.getBufferResource(surface);
+    // Map resource. Copy data to VBO. Unmap resource.
+    size_t num_bytes = surface->size();
+    T* d_vbo = NULL;
+    cudaGraphicsMapResources(1, &cudaVBOResource, 0);
+    cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
+    cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+               cuda::getStream(cuda::getActiveDeviceId()));
+    cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
+
+    CheckGL("After cuda resource copy");
+
+    POST_LAUNCH_CHECK();
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_surface<T>(const Array<T> &P, fg::Surface* surface);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(uchar)
+
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/cuda/surface.hpp b/src/backend/cuda/surface.hpp
new file mode 100644
index 0000000000..d7019837e2
--- /dev/null
+++ b/src/backend/cuda/surface.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace cuda
+{
+    template<typename T>
+    void copy_surface(const Array<T> &P, fg::Surface* surface);
+}
+
+#endif
+
diff --git a/src/backend/cuda/susan.cu b/src/backend/cuda/susan.cu
index 8474454879..f79e07aa02 100644
--- a/src/backend/cuda/susan.cu
+++ b/src/backend/cuda/susan.cu
@@ -42,14 +42,20 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
     memFree(resp);
 
     const unsigned corners_out = min(corners_found, corner_lim);
-    if (corners_out == 0)
+    if (corners_out == 0) {
+        memFree(x_corners);
+        memFree(y_corners);
+        memFree(resp_corners);
+        x_out    = createEmptyArray<float>(dim4());
+        y_out    = createEmptyArray<float>(dim4());
+        resp_out = createEmptyArray<float>(dim4());
         return 0;
-
-    x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
-    y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
-    resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
-
-    return corners_out;
+    } else {
+        x_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners);
+        y_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners);
+        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners);
+        return corners_out;
+    }
 }
 
 #define INSTANTIATE(T) \
@@ -63,5 +69,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/svd.cu b/src/backend/cuda/svd.cu
index 37ffa78319..e07c1f0564 100644
--- a/src/backend/cuda/svd.cu
+++ b/src/backend/cuda/svd.cu
@@ -17,13 +17,12 @@
 #include <math.hpp>
 #include <err_common.hpp>
 
-namespace cuda
-{
-
 #if defined(WITH_CUDA_LINEAR_ALGEBRA)
 
 #include <cusolverDnManager.hpp>
 
+namespace cuda
+{
     using cusolver::getDnHandle;
 
     template<typename T>
@@ -124,9 +123,33 @@ SVD_SPECIALIZE(cdouble, double, Z);
             transpose_inplace(u, true);
         }
     }
+}
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <cpu_lapack/cpu_svd.hpp>
+
+namespace cuda
+{
+
+template<typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+    return cpu::svd<T, Tr>(s, u, vt, in);
+}
+
+template<typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+    return cpu::svdInPlace<T, Tr>(s, u, vt, in);
+}
+
+}
 
 #else
 
+namespace cuda
+{
+
 template<typename T, typename Tr>
 void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
 {
@@ -141,8 +164,13 @@ void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
              AF_ERR_NOT_CONFIGURED);
 }
 
+}
+
 #endif
 
+namespace cuda
+{
+
 #define INSTANTIATE(T, Tr)                                              \
     template void svd<T, Tr>(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in); \
     template void svdInPlace<T, Tr>(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in);
diff --git a/src/backend/cuda/tile.cu b/src/backend/cuda/tile.cu
index 2a9af87820..f15fd87039 100644
--- a/src/backend/cuda/tile.cu
+++ b/src/backend/cuda/tile.cu
@@ -46,5 +46,7 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/transform.cu b/src/backend/cuda/transform.cu
index 214bce309f..853617c0a4 100644
--- a/src/backend/cuda/transform.cu
+++ b/src/backend/cuda/transform.cu
@@ -55,4 +55,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/transpose.cu b/src/backend/cuda/transpose.cu
index e787b6ede4..fff167a86d 100644
--- a/src/backend/cuda/transpose.cu
+++ b/src/backend/cuda/transpose.cu
@@ -46,5 +46,7 @@ INSTANTIATE(uint   )
 INSTANTIATE(uchar  )
 INSTANTIATE(intl   )
 INSTANTIATE(uintl  )
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cuda/transpose_inplace.cu b/src/backend/cuda/transpose_inplace.cu
index 98613bc846..1d34580d3e 100644
--- a/src/backend/cuda/transpose_inplace.cu
+++ b/src/backend/cuda/transpose_inplace.cu
@@ -37,6 +37,8 @@ INSTANTIATE(uint   )
 INSTANTIATE(uchar  )
 INSTANTIATE(intl   )
 INSTANTIATE(uintl  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
 
diff --git a/src/backend/cuda/triangle.cu b/src/backend/cuda/triangle.cu
index 99970a0d72..e92b1d5f65 100644
--- a/src/backend/cuda/triangle.cu
+++ b/src/backend/cuda/triangle.cu
@@ -52,4 +52,6 @@ Array<T> triangle(const Array<T> &in)
     INSTANTIATE(uintl)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/types.cpp b/src/backend/cuda/types.cpp
index f83913bce9..8c29c00b45 100644
--- a/src/backend/cuda/types.cpp
+++ b/src/backend/cuda/types.cpp
@@ -24,6 +24,8 @@ namespace cuda
     template<> const char *cuShortName<uchar   >() { return "h"; }
     template<> const char *cuShortName<intl    >() { return "x"; }
     template<> const char *cuShortName<uintl   >() { return "y"; }
+    template<> const char *cuShortName<short   >() { return "s"; }
+    template<> const char *cuShortName<ushort  >() { return "t"; }
 
     template<typename T > const char *afShortName(bool caps) { return caps ?  "Q" : "q"; }
     template<> const char *afShortName<float   >(bool caps) { return caps ?  "S" : "s"; }
@@ -36,6 +38,8 @@ namespace cuda
     template<> const char *afShortName<uchar   >(bool caps) { return caps ?  "V" : "v"; }
     template<> const char *afShortName<intl    >(bool caps) { return caps ?  "X" : "x"; }
     template<> const char *afShortName<uintl   >(bool caps) { return caps ?  "Y" : "y"; }
+    template<> const char *afShortName<short   >(bool caps) { return caps ?  "P" : "P"; }
+    template<> const char *afShortName<ushort  >(bool caps) { return caps ?  "Q" : "Q"; }
 
     template<typename T > const char *irname() { return  "i32"; }
     template<> const char *irname<float   >() { return  "float"; }
@@ -48,6 +52,8 @@ namespace cuda
     template<> const char *irname<uintl   >() { return  "i64"; }
     template<> const char *irname<char    >() { return  "i8"; }
     template<> const char *irname<uchar   >() { return  "i8"; }
+    template<> const char *irname<short   >() { return  "i16"; }
+    template<> const char *irname<ushort  >() { return  "i16"; }
 
     template <typename T>
     static inline std::string toString(T val)
@@ -89,4 +95,6 @@ namespace cuda
     INSTANTIATE(uint)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/types.hpp b/src/backend/cuda/types.hpp
index 0d807ae364..26d0bb658d 100644
--- a/src/backend/cuda/types.hpp
+++ b/src/backend/cuda/types.hpp
@@ -16,7 +16,8 @@ namespace cuda
     typedef cuFloatComplex   cfloat;
     typedef cuDoubleComplex cdouble;
     typedef unsigned int   uint;
-    typedef unsigned char uchar;
+    typedef unsigned char  uchar;
+    typedef unsigned short ushort;
 
     template<typename T> struct is_complex          { static const bool value = false;  };
     template<> struct           is_complex<cfloat>  { static const bool value = true;   };
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index 8600ca10e5..a61aba487e 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -54,4 +54,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/cuda/where.cu b/src/backend/cuda/where.cu
index 8e4f9cfe80..a43e339cdd 100644
--- a/src/backend/cuda/where.cu
+++ b/src/backend/cuda/where.cu
@@ -42,5 +42,7 @@ namespace cuda
     INSTANTIATE(intl   )
     INSTANTIATE(uintl  )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/cuda/wrap.cu b/src/backend/cuda/wrap.cu
index a1e70fccd3..017a3a41e8 100644
--- a/src/backend/cuda/wrap.cu
+++ b/src/backend/cuda/wrap.cu
@@ -54,4 +54,6 @@ namespace cuda
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/defines.hpp b/src/backend/defines.hpp
index 26898370b3..4308ca952c 100644
--- a/src/backend/defines.hpp
+++ b/src/backend/defines.hpp
@@ -9,6 +9,10 @@
 
 #pragma once
 
+#include <af/macros.h>
+
+#define MSG AF_MSG
+
 #if defined(_WIN32) || defined(_MSC_VER)
     #define __PRETTY_FUNCTION__ __FUNCSIG__
     #if _MSC_VER < 1900
diff --git a/src/backend/dim4.cpp b/src/backend/dim4.cpp
index d1f69d2044..024e3595f1 100644
--- a/src/backend/dim4.cpp
+++ b/src/backend/dim4.cpp
@@ -12,11 +12,11 @@
 #include <cmath>
 #include <cfloat>
 #include <af/dim4.hpp>
-#include <ArrayInfo.hpp>
 #include <err_common.hpp>
 
 namespace af
 {
+
 #if __cplusplus > 199711l
     static_assert(std::is_standard_layout<dim4>::value, "af::dim4 must be a standard layout type");
 #endif
@@ -210,48 +210,4 @@ dim_t calcDim(const af_seq &seq, const dim_t &parentDim)
     return outDim;
 }
 
-} // end namespace af
-
-using af::dim4;
-using std::vector;
-
-dim4
-toDims(const vector<af_seq>& seqs, const dim4 &parentDims)
-{
-    dim4 outDims(1, 1, 1, 1);
-    for(unsigned i = 0; i < seqs.size(); i++ ) {
-        outDims[i] = af::calcDim(seqs[i], parentDims[i]);
-        if (outDims[i] > parentDims[i])
-            AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
-    }
-    return outDims;
-}
-
-dim4
-toOffset(const vector<af_seq>& seqs, const dim4 &parentDims)
-{
-    dim4 outOffsets(0, 0, 0, 0);
-    for(unsigned i = 0; i < seqs.size(); i++ ) {
-        if (seqs[i].step !=0 && seqs[i].begin >= 0) {
-            outOffsets[i] = seqs[i].begin;
-        } else if (seqs[i].begin <= -1) {
-            outOffsets[i] = parentDims[i] + seqs[i].begin;
-        } else {
-            outOffsets[i] = 0;
-        }
-
-        if (outOffsets[i] >= parentDims[i])
-            AF_ERROR("Index out of range", AF_ERR_SIZE);
-    }
-    return outOffsets;
-}
-
-dim4
-toStride(const vector<af_seq>& seqs, const af::dim4 &parentDims)
-{
-    dim4 out(calcStrides(parentDims));
-    for(unsigned i = 0; i < seqs.size(); i++ ) {
-        if  (seqs[i].step != 0) {   out[i] *= seqs[i].step; }
-    }
-    return out;
 }
diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp
index 395f3c77aa..466666fa4a 100644
--- a/src/backend/opencl/Array.cpp
+++ b/src/backend/opencl/Array.cpp
@@ -38,7 +38,7 @@ namespace opencl
 
     template<typename T>
     Array<T>::Array(af::dim4 dims, JIT::Node_ptr n) :
-        info(-1, dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
         data(),
         data_dims(dims),
         node(n), offset(0), ready(false), owner(true)
@@ -323,5 +323,7 @@ namespace opencl
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/all.cpp b/src/backend/opencl/all.cpp
index 4f5c131a35..3c9513db4c 100644
--- a/src/backend/opencl/all.cpp
+++ b/src/backend/opencl/all.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_and_t, uintl  , char)
     INSTANTIATE(af_and_t, char   , char)
     INSTANTIATE(af_and_t, uchar  , char)
+    INSTANTIATE(af_and_t, short  , char)
+    INSTANTIATE(af_and_t, ushort , char)
 }
diff --git a/src/backend/opencl/any.cpp b/src/backend/opencl/any.cpp
index ee8599daa8..e8c6de51ed 100644
--- a/src/backend/opencl/any.cpp
+++ b/src/backend/opencl/any.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_or_t, uintl  , char)
     INSTANTIATE(af_or_t, char   , char)
     INSTANTIATE(af_or_t, uchar  , char)
+    INSTANTIATE(af_or_t, short  , char)
+    INSTANTIATE(af_or_t, ushort , char)
 }
diff --git a/src/backend/opencl/assign.cpp b/src/backend/opencl/assign.cpp
index 15d579db0d..903b59b804 100644
--- a/src/backend/opencl/assign.cpp
+++ b/src/backend/opencl/assign.cpp
@@ -78,11 +78,13 @@ INSTANTIATE(cdouble)
 INSTANTIATE(double )
 INSTANTIATE(cfloat )
 INSTANTIATE(float  )
-INSTANTIATE(uintl  )
+INSTANTIATE(int    )
 INSTANTIATE(uint   )
 INSTANTIATE(intl   )
-INSTANTIATE(int    )
+INSTANTIATE(uintl  )
 INSTANTIATE(uchar  )
 INSTANTIATE(char   )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/opencl/bilateral.cpp b/src/backend/opencl/bilateral.cpp
index 1cd54d973b..c1a42ac8fc 100644
--- a/src/backend/opencl/bilateral.cpp
+++ b/src/backend/opencl/bilateral.cpp
@@ -37,5 +37,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/opencl/convolve.cpp b/src/backend/opencl/convolve.cpp
index b800591b1e..18d719eff6 100644
--- a/src/backend/opencl/convolve.cpp
+++ b/src/backend/opencl/convolve.cpp
@@ -77,5 +77,9 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
diff --git a/src/backend/opencl/convolve_separable.cpp b/src/backend/opencl/convolve_separable.cpp
index fede1d7d0d..6b52168e7b 100644
--- a/src/backend/opencl/convolve_separable.cpp
+++ b/src/backend/opencl/convolve_separable.cpp
@@ -63,5 +63,9 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(intl   ,   float)
+INSTANTIATE(uintl  ,   float)
 
 }
diff --git a/src/backend/opencl/copy.cpp b/src/backend/opencl/copy.cpp
index 370b072a5d..39cbf4b59d 100644
--- a/src/backend/opencl/copy.cpp
+++ b/src/backend/opencl/copy.cpp
@@ -141,6 +141,8 @@ namespace opencl
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
     #define INSTANTIATE_PAD_ARRAY(SRC_T)                                    \
     template Array<float  > padArray<SRC_T, float  >(Array<SRC_T> const &src, dim4 const &dims, float   default_value, double factor); \
@@ -149,8 +151,10 @@ namespace opencl
     template Array<cdouble> padArray<SRC_T, cdouble>(Array<SRC_T> const &src, dim4 const &dims, cdouble default_value, double factor); \
     template Array<int    > padArray<SRC_T, int    >(Array<SRC_T> const &src, dim4 const &dims, int     default_value, double factor); \
     template Array<uint   > padArray<SRC_T, uint   >(Array<SRC_T> const &src, dim4 const &dims, uint    default_value, double factor); \
-    template Array<intl    > padArray<SRC_T, intl    >(Array<SRC_T> const &src, dim4 const &dims, intl     default_value, double factor); \
-    template Array<uintl   > padArray<SRC_T, uintl   >(Array<SRC_T> const &src, dim4 const &dims, uintl    default_value, double factor); \
+    template Array<intl   > padArray<SRC_T, intl   >(Array<SRC_T> const &src, dim4 const &dims, intl    default_value, double factor); \
+    template Array<uintl  > padArray<SRC_T, uintl  >(Array<SRC_T> const &src, dim4 const &dims, uintl   default_value, double factor); \
+    template Array<short  > padArray<SRC_T, short  >(Array<SRC_T> const &src, dim4 const &dims, short   default_value, double factor); \
+    template Array<ushort > padArray<SRC_T, ushort >(Array<SRC_T> const &src, dim4 const &dims, ushort  default_value, double factor); \
     template Array<uchar  > padArray<SRC_T, uchar  >(Array<SRC_T> const &src, dim4 const &dims, uchar   default_value, double factor); \
     template Array<char   > padArray<SRC_T, char   >(Array<SRC_T> const &src, dim4 const &dims, char    default_value, double factor); \
     template void copyArray<SRC_T, float  >(Array<float  > &dst, Array<SRC_T> const &src); \
@@ -159,8 +163,10 @@ namespace opencl
     template void copyArray<SRC_T, cdouble>(Array<cdouble> &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, int    >(Array<int    > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, uint   >(Array<uint   > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, intl    >(Array<intl    > &dst, Array<SRC_T> const &src); \
-    template void copyArray<SRC_T, uintl   >(Array<uintl   > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, intl   >(Array<intl   > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, uintl  >(Array<uintl  > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, short  >(Array<short  > &dst, Array<SRC_T> const &src); \
+    template void copyArray<SRC_T, ushort >(Array<ushort > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src); \
     template void copyArray<SRC_T, char   >(Array<char   > &dst, Array<SRC_T> const &src);
 
@@ -168,10 +174,12 @@ namespace opencl
     INSTANTIATE_PAD_ARRAY(double)
     INSTANTIATE_PAD_ARRAY(int   )
     INSTANTIATE_PAD_ARRAY(uint  )
-    INSTANTIATE_PAD_ARRAY(intl   )
-    INSTANTIATE_PAD_ARRAY(uintl  )
+    INSTANTIATE_PAD_ARRAY(intl  )
+    INSTANTIATE_PAD_ARRAY(uintl )
     INSTANTIATE_PAD_ARRAY(uchar )
     INSTANTIATE_PAD_ARRAY(char  )
+    INSTANTIATE_PAD_ARRAY(short )
+    INSTANTIATE_PAD_ARRAY(ushort)
 
 #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T)                            \
     template Array<cfloat > padArray<SRC_T, cfloat >(Array<SRC_T> const &src, dim4 const &dims, cfloat  default_value, double factor); \
@@ -196,6 +204,8 @@ namespace opencl
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, int)
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, intl)
     SPECILIAZE_UNUSED_COPYARRAY(cfloat, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat, short)
+    SPECILIAZE_UNUSED_COPYARRAY(cfloat, ushort)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
@@ -204,5 +214,7 @@ namespace opencl
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
     SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+    SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
 
 }
diff --git a/src/backend/opencl/count.cpp b/src/backend/opencl/count.cpp
index e5ad4bf0c5..c1162954ad 100644
--- a/src/backend/opencl/count.cpp
+++ b/src/backend/opencl/count.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_notzero_t, uintl  , uint)
     INSTANTIATE(af_notzero_t, char   , uint)
     INSTANTIATE(af_notzero_t, uchar  , uint)
+    INSTANTIATE(af_notzero_t, short  , uint)
+    INSTANTIATE(af_notzero_t, ushort , uint)
 }
diff --git a/src/backend/opencl/diagonal.cpp b/src/backend/opencl/diagonal.cpp
index a6d3e2c2dd..79cd758bd5 100644
--- a/src/backend/opencl/diagonal.cpp
+++ b/src/backend/opencl/diagonal.cpp
@@ -57,5 +57,7 @@ namespace opencl
     INSTANTIATE_DIAGONAL(uintl)
     INSTANTIATE_DIAGONAL(char)
     INSTANTIATE_DIAGONAL(uchar)
+    INSTANTIATE_DIAGONAL(short)
+    INSTANTIATE_DIAGONAL(ushort)
 
 }
diff --git a/src/backend/opencl/diff.cpp b/src/backend/opencl/diff.cpp
index cfcd684080..b466b8a739 100644
--- a/src/backend/opencl/diff.cpp
+++ b/src/backend/opencl/diff.cpp
@@ -73,5 +73,7 @@ namespace opencl
     INSTANTIATE(uchar)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
     INSTANTIATE(char)
 }
diff --git a/src/backend/opencl/dilate.cpp b/src/backend/opencl/dilate.cpp
index fbc5b2881d..fff9f99887 100644
--- a/src/backend/opencl/dilate.cpp
+++ b/src/backend/opencl/dilate.cpp
@@ -18,5 +18,7 @@ INSTANTIATE(char  , true)
 INSTANTIATE(int   , true)
 INSTANTIATE(uint  , true)
 INSTANTIATE(uchar , true)
+INSTANTIATE(short , true)
+INSTANTIATE(ushort, true)
 
 }
diff --git a/src/backend/opencl/dilate3d.cpp b/src/backend/opencl/dilate3d.cpp
index 7c8898f175..d519957a63 100644
--- a/src/backend/opencl/dilate3d.cpp
+++ b/src/backend/opencl/dilate3d.cpp
@@ -18,5 +18,7 @@ INSTANTIATE(char  , true)
 INSTANTIATE(int   , true)
 INSTANTIATE(uint  , true)
 INSTANTIATE(uchar , true)
+INSTANTIATE(short , true)
+INSTANTIATE(ushort, true)
 
 }
diff --git a/src/backend/opencl/erode.cpp b/src/backend/opencl/erode.cpp
index bcb1579291..1618802575 100644
--- a/src/backend/opencl/erode.cpp
+++ b/src/backend/opencl/erode.cpp
@@ -18,5 +18,7 @@ INSTANTIATE(char  , false)
 INSTANTIATE(int   , false)
 INSTANTIATE(uint  , false)
 INSTANTIATE(uchar , false)
+INSTANTIATE(short , false)
+INSTANTIATE(ushort, false)
 
 }
diff --git a/src/backend/opencl/erode3d.cpp b/src/backend/opencl/erode3d.cpp
index 71ee3fd504..7ffb423687 100644
--- a/src/backend/opencl/erode3d.cpp
+++ b/src/backend/opencl/erode3d.cpp
@@ -18,5 +18,7 @@ INSTANTIATE(char  , false)
 INSTANTIATE(int   , false)
 INSTANTIATE(uint  , false)
 INSTANTIATE(uchar , false)
+INSTANTIATE(short , false)
+INSTANTIATE(ushort, false)
 
 }
diff --git a/src/backend/opencl/fast.cpp b/src/backend/opencl/fast.cpp
index 5af04a8425..0813595144 100644
--- a/src/backend/opencl/fast.cpp
+++ b/src/backend/opencl/fast.cpp
@@ -57,5 +57,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/fftconvolve.cpp b/src/backend/opencl/fftconvolve.cpp
index d97f83a8cd..f824f75cae 100644
--- a/src/backend/opencl/fftconvolve.cpp
+++ b/src/backend/opencl/fftconvolve.cpp
@@ -136,5 +136,9 @@ INSTANTIATE(uint  , float,  cfloat,  false, true)
 INSTANTIATE(int   , float,  cfloat,  false, true)
 INSTANTIATE(uchar , float,  cfloat,  false, true)
 INSTANTIATE(char  , float,  cfloat,  false, true)
+INSTANTIATE(ushort, float,  cfloat,  false, true)
+INSTANTIATE(short , float,  cfloat,  false, true)
+INSTANTIATE(uintl , float,  cfloat,  false, true)
+INSTANTIATE(intl  , float,  cfloat,  false, true)
 
 }
diff --git a/src/backend/opencl/hist_graphics.cpp b/src/backend/opencl/hist_graphics.cpp
index cde15a1799..022bcf1aaf 100644
--- a/src/backend/opencl/hist_graphics.cpp
+++ b/src/backend/opencl/hist_graphics.cpp
@@ -60,6 +60,8 @@ void copy_histogram(const Array<T> &data, const fg::Histogram* hist)
 INSTANTIATE(float)
 INSTANTIATE(int)
 INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 INSTANTIATE(uchar)
 
 }
diff --git a/src/backend/opencl/histogram.cpp b/src/backend/opencl/histogram.cpp
index 3a7bd7253a..d7de9915fa 100644
--- a/src/backend/opencl/histogram.cpp
+++ b/src/backend/opencl/histogram.cpp
@@ -46,5 +46,9 @@ INSTANTIATE(char  , uint)
 INSTANTIATE(int   , uint)
 INSTANTIATE(uint  , uint)
 INSTANTIATE(uchar , uint)
+INSTANTIATE(short , uint)
+INSTANTIATE(ushort, uint)
+INSTANTIATE(intl  , uint)
+INSTANTIATE(uintl , uint)
 
 }
diff --git a/src/backend/opencl/homography.cpp b/src/backend/opencl/homography.cpp
new file mode 100644
index 0000000000..dbce53b19b
--- /dev/null
+++ b/src/backend/opencl/homography.cpp
@@ -0,0 +1,96 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <arith.hpp>
+#include <random.hpp>
+#include <kernel/homography.hpp>
+#include <algorithm>
+
+#include <iostream>
+#include <cfloat>
+
+using af::dim4;
+
+namespace opencl
+{
+
+#define RANSACConfidence 0.99f
+#define LMEDSConfidence 0.99f
+#define LMEDSOutlierRatio 0.4f
+
+template<typename T>
+int homography(Array<T> &bestH,
+               const Array<float> &x_src,
+               const Array<float> &y_src,
+               const Array<float> &x_dst,
+               const Array<float> &y_dst,
+               const af_homography_type htype,
+               const float inlier_thr,
+               const unsigned iterations)
+{
+    const af::dim4 idims = x_src.dims();
+    const unsigned nsamples = idims[0];
+
+    unsigned iter = iterations;
+    Array<float> err = createEmptyArray<float>(af::dim4());
+    if (htype == AF_HOMOGRAPHY_LMEDS) {
+        iter = ::std::min(iter, (unsigned)(log(1.f - LMEDSConfidence) / log(1.f - pow(1.f - LMEDSOutlierRatio, 4.f))));
+        err = createValueArray<float>(af::dim4(nsamples, iter), FLT_MAX);
+    }
+    else {
+        // Avoid passing "null" cl_mem object to kernels
+        err = createEmptyArray<float>(af::dim4(1));
+    }
+
+    const size_t iter_sz = divup(iter, 256) * 256;
+
+    af::dim4 rdims(4, iter_sz);
+    Array<float> frnd = randu<float>(rdims);
+    Array<float> fctr = createValueArray<float>(rdims, (float)nsamples);
+    Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
+
+    Array<T> tmpH = createValueArray<T>(af::dim4(9, iter_sz), (T)0);
+    Array<T> tmpA = createValueArray<T>(af::dim4(9, 9, iter_sz), (T)0);
+    Array<T> tmpV = createValueArray<T>(af::dim4(9, 9, iter_sz), (T)0);
+
+    bestH = createValueArray<T>(af::dim4(3, 3), (T)0);
+    switch (htype) {
+    case AF_HOMOGRAPHY_RANSAC:
+        return kernel::computeH<T, AF_HOMOGRAPHY_RANSAC>(bestH, tmpH, tmpA, tmpV, err,
+                                              x_src, y_src, x_dst, y_dst,
+                                              rnd, iter, nsamples, inlier_thr);
+        break;
+    case AF_HOMOGRAPHY_LMEDS:
+        return kernel::computeH<T, AF_HOMOGRAPHY_LMEDS> (bestH, tmpH, tmpA, tmpV, err,
+                                              x_src, y_src, x_dst, y_dst,
+                                              rnd, iter, nsamples, inlier_thr);
+        break;
+    default:
+        return -1;
+        break;
+    }
+}
+
+#define INSTANTIATE(T)                                                              \
+    template int homography(Array<T> &H,                                            \
+                            const Array<float> &x_src, const Array<float> &y_src,   \
+                            const Array<float> &x_dst, const Array<float> &y_dst,   \
+                            const af_homography_type htype, const float inlier_thr, \
+                            const unsigned iterations);
+
+INSTANTIATE(float )
+INSTANTIATE(double)
+
+}
diff --git a/src/backend/opencl/homography.hpp b/src/backend/opencl/homography.hpp
new file mode 100644
index 0000000000..6c926e50f4
--- /dev/null
+++ b/src/backend/opencl/homography.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace opencl
+{
+
+template<typename T>
+int homography(Array<T> &H,
+               const Array<float> &x_src, const Array<float> &y_src,
+               const Array<float> &x_dst, const Array<float> &y_dst,
+               const af_homography_type htype, const float inlier_thr,
+               const unsigned iterations);
+
+}
diff --git a/src/backend/opencl/identity.cpp b/src/backend/opencl/identity.cpp
index dd6414027c..4f10a191c5 100644
--- a/src/backend/opencl/identity.cpp
+++ b/src/backend/opencl/identity.cpp
@@ -38,5 +38,7 @@ namespace opencl
     INSTANTIATE_IDENTITY(uintl)
     INSTANTIATE_IDENTITY(char)
     INSTANTIATE_IDENTITY(uchar)
+    INSTANTIATE_IDENTITY(short)
+    INSTANTIATE_IDENTITY(ushort)
 
 }
diff --git a/src/backend/opencl/image.cpp b/src/backend/opencl/image.cpp
index 1ee886b8e2..c758df4953 100644
--- a/src/backend/opencl/image.cpp
+++ b/src/backend/opencl/image.cpp
@@ -45,16 +45,12 @@ void copy_image(const Array<T> &in, const fg::Image* image)
     } else {
         CheckGL("Begin OpenCL fallback-resource copy");
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo());
-        CheckGL("1Begin OpenCL fallback-resource copy");
         glBufferData(GL_PIXEL_UNPACK_BUFFER, image->size(), 0, GL_STREAM_DRAW);
-        CheckGL("2Begin OpenCL fallback-resource copy");
         GLubyte* ptr = (GLubyte*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);
-        CheckGL("3Begin OpenCL fallback-resource copy");
         if (ptr) {
             getQueue().enqueueReadBuffer(*in.get(), CL_TRUE, 0, image->size(), ptr);
             glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
         }
-        CheckGL("4Begin OpenCL fallback-resource copy");
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
         CheckGL("End OpenCL fallback-resource copy");
     }
@@ -69,6 +65,8 @@ INSTANTIATE(int)
 INSTANTIATE(uint)
 INSTANTIATE(uchar)
 INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
 
 }
 
diff --git a/src/backend/opencl/index.cpp b/src/backend/opencl/index.cpp
index 33dc559b8f..6502ee0f43 100644
--- a/src/backend/opencl/index.cpp
+++ b/src/backend/opencl/index.cpp
@@ -82,11 +82,13 @@ INSTANTIATE(cdouble)
 INSTANTIATE(double )
 INSTANTIATE(cfloat )
 INSTANTIATE(float  )
-INSTANTIATE(uintl  )
+INSTANTIATE(int    )
 INSTANTIATE(uint   )
 INSTANTIATE(intl   )
-INSTANTIATE(int    )
+INSTANTIATE(uintl  )
 INSTANTIATE(uchar  )
 INSTANTIATE(char   )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/opencl/interopManager.cpp b/src/backend/opencl/interopManager.cpp
index 2b6fda0cc7..89487ec9ca 100644
--- a/src/backend/opencl/interopManager.cpp
+++ b/src/backend/opencl/interopManager.cpp
@@ -59,6 +59,18 @@ cl::Buffer* InteropManager::getBufferResource(const fg::Plot* plot)
     return interop_maps[device][key];
 }
 
+cl::Buffer* InteropManager::getBufferResource(const fg::Plot3* plot3)
+{
+    void * key = (void*)plot3;
+    int device = getActiveDeviceId();
+    iter_t iter = interop_maps[device].find(key);
+
+    if (iter == interop_maps[device].end())
+        interop_maps[device][key] = new cl::BufferGL(getContext(), CL_MEM_WRITE_ONLY, plot3->vbo(), NULL);
+
+    return interop_maps[device][key];
+}
+
 cl::Buffer* InteropManager::getBufferResource(const fg::Histogram* hist)
 {
     void * key = (void*)hist;
@@ -71,6 +83,18 @@ cl::Buffer* InteropManager::getBufferResource(const fg::Histogram* hist)
     return interop_maps[device][key];
 }
 
+cl::Buffer* InteropManager::getBufferResource(const fg::Surface* surface)
+{
+    void * key = (void*)surface;
+    int device = getActiveDeviceId();
+    iter_t iter = interop_maps[device].find(key);
+
+    if (iter == interop_maps[device].end())
+        interop_maps[device][key] = new cl::BufferGL(getContext(), CL_MEM_WRITE_ONLY, surface->vbo(), NULL);
+
+    return interop_maps[device][key];
+}
+
 }
 
 #endif
diff --git a/src/backend/opencl/interopManager.hpp b/src/backend/opencl/interopManager.hpp
index 6af6d17ed7..c7a2c25868 100644
--- a/src/backend/opencl/interopManager.hpp
+++ b/src/backend/opencl/interopManager.hpp
@@ -30,7 +30,9 @@ class InteropManager
         ~InteropManager();
         cl::Buffer* getBufferResource(const fg::Image* image);
         cl::Buffer* getBufferResource(const fg::Plot* plot);
+        cl::Buffer* getBufferResource(const fg::Plot3* plot3);
         cl::Buffer* getBufferResource(const fg::Histogram* hist);
+        cl::Buffer* getBufferResource(const fg::Surface* surface);
 
     protected:
         InteropManager() {}
diff --git a/src/backend/opencl/iota.cpp b/src/backend/opencl/iota.cpp
index fb98bca6c4..ac4408c8b4 100644
--- a/src/backend/opencl/iota.cpp
+++ b/src/backend/opencl/iota.cpp
@@ -37,4 +37,6 @@ namespace opencl
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/ireduce.cpp b/src/backend/opencl/ireduce.cpp
index 698137c883..e02c7e55d4 100644
--- a/src/backend/opencl/ireduce.cpp
+++ b/src/backend/opencl/ireduce.cpp
@@ -51,6 +51,8 @@ namespace opencl
     INSTANTIATE(af_min_t, uintl  )
     INSTANTIATE(af_min_t, char   )
     INSTANTIATE(af_min_t, uchar  )
+    INSTANTIATE(af_min_t, short  )
+    INSTANTIATE(af_min_t, ushort )
 
     //max
     INSTANTIATE(af_max_t, float  )
@@ -63,4 +65,6 @@ namespace opencl
     INSTANTIATE(af_max_t, uintl  )
     INSTANTIATE(af_max_t, char   )
     INSTANTIATE(af_max_t, uchar  )
+    INSTANTIATE(af_max_t, short  )
+    INSTANTIATE(af_max_t, ushort )
 }
diff --git a/src/backend/opencl/join.cpp b/src/backend/opencl/join.cpp
index a02fb2fd6a..64a8aaafdf 100644
--- a/src/backend/opencl/join.cpp
+++ b/src/backend/opencl/join.cpp
@@ -179,6 +179,8 @@ namespace opencl
     INSTANTIATE(uint,    uint)
     INSTANTIATE(intl,    intl)
     INSTANTIATE(uintl,   uintl)
+    INSTANTIATE(short,   short)
+    INSTANTIATE(ushort,  ushort)
     INSTANTIATE(uchar,   uchar)
     INSTANTIATE(char,    char)
 
@@ -195,6 +197,8 @@ namespace opencl
     INSTANTIATE(uint)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
 
diff --git a/src/backend/opencl/kernel/approx.hpp b/src/backend/opencl/kernel/approx.hpp
index 6ec637ac9a..d7b5997a9e 100644
--- a/src/backend/opencl/kernel/approx.hpp
+++ b/src/backend/opencl/kernel/approx.hpp
@@ -87,18 +87,21 @@ namespace opencl
 
 
                 auto approx1Op = make_kernel<Buffer, const KParam, const Buffer, const KParam,
-                                       const Buffer, const KParam, const float, const int>
+                                       const Buffer, const KParam, const float, const dim_t, const int>
                                       (*approxKernels[device]);
 
                 NDRange local(THREADS, 1, 1);
-                int blocksPerMat = divup(out.info.dims[0], local[0]);
+                dim_t blocksPerMat = divup(out.info.dims[0], local[0]);
                 NDRange global(blocksPerMat * local[0] * out.info.dims[1],
                                out.info.dims[2] * out.info.dims[3] * local[0],
                                1);
 
+                // Passing bools to opencl kernels is not allowed
+                bool pBatch = !(pos.info.dims[1] == 1 && pos.info.dims[2] == 1 && pos.info.dims[3] == 1);
+
                 approx1Op(EnqueueArgs(getQueue(), global, local),
                           *out.data, out.info, *in.data, in.info,
-                          *pos.data, pos.info, offGrid, blocksPerMat);
+                          *pos.data, pos.info, offGrid, blocksPerMat, (int)pBatch);
 
                 CL_DEBUG_FINISH(getQueue());
             } catch (cl::Error err) {
@@ -152,23 +155,25 @@ namespace opencl
 
                 auto approx2Op = make_kernel<Buffer, const KParam, const Buffer, const KParam,
                                        const Buffer, const KParam, const Buffer, const KParam,
-                                       const float, const int, const int>
+                                       const float, const dim_t, const dim_t, const int>
                                        (*approxKernels[device]);
 
                 NDRange local(TX, TY, 1);
-                int blocksPerMatX = divup(out.info.dims[0], local[0]);
-                int blocksPerMatY = divup(out.info.dims[1], local[1]);
+                dim_t blocksPerMatX = divup(out.info.dims[0], local[0]);
+                dim_t blocksPerMatY = divup(out.info.dims[1], local[1]);
                 NDRange global(blocksPerMatX * local[0] * out.info.dims[2],
                                blocksPerMatY * local[1] * out.info.dims[3],
                                1);
 
+                // Passing bools to opencl kernels is not allowed
+                bool pBatch = !(pos.info.dims[2] == 1 && pos.info.dims[3] == 1);
 
                 approx2Op(EnqueueArgs(getQueue(), global, local),
                           *out.data, out.info,
                           *in.data, in.info,
                           *pos.data, pos.info,
                           *qos.data, qos.info,
-                          offGrid, blocksPerMatX, blocksPerMatY);
+                          offGrid, blocksPerMatX, blocksPerMatY, (int)pBatch);
                 CL_DEBUG_FINISH(getQueue());
             } catch (cl::Error err) {
                 CL_TO_AF_ERROR(err);
diff --git a/src/backend/opencl/kernel/approx1.cl b/src/backend/opencl/kernel/approx1.cl
index 3531e2f83e..5693fc3907 100644
--- a/src/backend/opencl/kernel/approx1.cl
+++ b/src/backend/opencl/kernel/approx1.cl
@@ -32,15 +32,16 @@ Ty div(Ty a, Tp b) { a.x = a.x / b; a.y = a.y / b; return a; }
 ///////////////////////////////////////////////////////////////////////////
 // nearest-neighbor resampling
 ///////////////////////////////////////////////////////////////////////////
-void core_nearest1(const int idx, const int idy, const int idz, const int idw,
+void core_nearest1(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                    __global       Ty *d_out, const KParam out,
                    __global const Ty *d_in,  const KParam in,
                    __global const Tp *d_pos, const KParam pos,
-                   const float offGrid)
+                   const float offGrid, const bool pBatch)
 {
-    const int omId = idw * out.strides[3] + idz * out.strides[2]
-                   + idy * out.strides[1] + idx;
-    const int pmId = idx;
+    const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                     + idy * out.strides[1] + idx;
+    dim_t pmId = idx;
+    if(pBatch) pmId += idw * pos.strides[3] + idz * pos.strides[2] + idy * pos.strides[1];
 
     const Tp pVal = d_pos[pmId];
     if (pVal < 0 || in.dims[0] < pVal+1) {
@@ -48,8 +49,8 @@ void core_nearest1(const int idx, const int idy, const int idz, const int idw,
         return;
     }
 
-    int ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1];
-    const int imId = round(pVal) + ioff;
+    dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1];
+    const dim_t imId = round(pVal) + ioff;
 
     Ty y;
     set(y, d_in[imId]);
@@ -59,15 +60,16 @@ void core_nearest1(const int idx, const int idy, const int idz, const int idw,
 ///////////////////////////////////////////////////////////////////////////
 // linear resampling
 ///////////////////////////////////////////////////////////////////////////
-void core_linear1(const int idx, const int idy, const int idz, const int idw,
+void core_linear1(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                    __global       Ty *d_out, const KParam out,
                    __global const Ty *d_in,  const KParam in,
                    __global const Tp *d_pos, const KParam pos,
-                   const float offGrid)
+                   const float offGrid, const bool pBatch)
 {
-    const int omId = idw * out.strides[3] + idz * out.strides[2]
-                        + idy * out.strides[1] + idx;
-    const int pmId = idx;
+    const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                     + idy * out.strides[1] + idx;
+    dim_t pmId = idx;
+    if(pBatch) pmId += idw * pos.strides[3] + idz * pos.strides[2] + idy * pos.strides[1];
 
     const Tp pVal = d_pos[pmId];
     if (pVal < 0 || in.dims[0] < pVal+1) {
@@ -75,10 +77,10 @@ void core_linear1(const int idx, const int idy, const int idz, const int idw,
         return;
     }
 
-    const int grid_x = floor(pVal);  // nearest grid
+    const dim_t grid_x = floor(pVal);  // nearest grid
     const Tp off_x = pVal - grid_x; // fractional offset
 
-    int ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1] + grid_x;
+    dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + idy * in.strides[1] + grid_x;
 
     // Check if pVal and pVal + 1 are both valid indices
     bool cond = (pVal < in.dims[0] - 1);
@@ -104,14 +106,14 @@ __kernel
 void approx1_kernel(__global       Ty *d_out, const KParam out,
                     __global const Ty *d_in,  const KParam in,
                     __global const Tp *d_pos, const KParam pos,
-                    const float offGrid, const int blocksMatX)
+                    const float offGrid, const dim_t blocksMatX, const int pBatch)
 {
-    const int idw = get_group_id(1) / out.dims[2];
-    const int idz = get_group_id(1)  - idw * out.dims[2];
+    const dim_t idw = get_group_id(1) / out.dims[2];
+    const dim_t idz = get_group_id(1)  - idw * out.dims[2];
 
-    const int idy = get_group_id(0) / blocksMatX;
-    const int blockIdx_x = get_group_id(0) - idy * blocksMatX;
-    const int idx = get_local_id(0) + blockIdx_x * get_local_size(0);
+    const dim_t idy = get_group_id(0) / blocksMatX;
+    const dim_t blockIdx_x = get_group_id(0) - idy * blocksMatX;
+    const dim_t idx = get_local_id(0) + blockIdx_x * get_local_size(0);
 
     if(idx >= out.dims[0] ||
        idy >= out.dims[1] ||
@@ -119,5 +121,5 @@ void approx1_kernel(__global       Ty *d_out, const KParam out,
        idw >= out.dims[3])
         return;
 
-    INTERP(idx, idy, idz, idw, d_out, out, d_in + in.offset, in, d_pos + pos.offset, pos, offGrid);
+    INTERP(idx, idy, idz, idw, d_out, out, d_in + in.offset, in, d_pos + pos.offset, pos, offGrid, pBatch);
 }
diff --git a/src/backend/opencl/kernel/approx2.cl b/src/backend/opencl/kernel/approx2.cl
index c540e1bc45..1066f55d41 100644
--- a/src/backend/opencl/kernel/approx2.cl
+++ b/src/backend/opencl/kernel/approx2.cl
@@ -32,17 +32,21 @@ Ty div(Ty a, Tp b) { a.x = a.x / b; a.y = a.y / b; return a; }
 ///////////////////////////////////////////////////////////////////////////
 // nearest-neighbor resampling
 ///////////////////////////////////////////////////////////////////////////
-void core_nearest2(const int idx, const int idy, const int idz, const int idw,
+void core_nearest2(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                    __global       Ty *d_out, const KParam out,
                    __global const Ty *d_in,  const KParam in,
                    __global const Tp *d_pos, const KParam pos,
                    __global const Tp *d_qos, const KParam qos,
-                   const float offGrid)
+                   const float offGrid, const bool pBatch)
 {
-    const int omId = idw * out.strides[3] + idz * out.strides[2]
-                        + idy * out.strides[1] + idx;
-    const int pmId = idy * pos.strides[1] + idx;
-    const int qmId = idy * qos.strides[1] + idx;
+    const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                     + idy * out.strides[1] + idx;
+    dim_t pmId = idy * pos.strides[1] + idx;
+    dim_t qmId = idy * qos.strides[1] + idx;
+    if(pBatch) {
+        pmId += idw * pos.strides[3] + idz * pos.strides[2];
+        qmId += idw * qos.strides[3] + idz * qos.strides[2];
+    }
 
     const Tp x = d_pos[pmId], y = d_qos[qmId];
     if (x < 0 || y < 0 || in.dims[0] < x+1 || in.dims[1] < y+1) {
@@ -50,9 +54,9 @@ void core_nearest2(const int idx, const int idy, const int idz, const int idw,
         return;
     }
 
-    const int grid_x = round(x), grid_y = round(y); // nearest grid
-    const int imId = idw * in.strides[3] + idz * in.strides[2]
-                     + grid_y * in.strides[1] + grid_x;
+    const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
+    const dim_t imId = idw * in.strides[3] + idz * in.strides[2]
+                  + grid_y * in.strides[1] + grid_x;
 
     Ty z;
     set(z, d_in[imId]);
@@ -62,17 +66,21 @@ void core_nearest2(const int idx, const int idy, const int idz, const int idw,
 ///////////////////////////////////////////////////////////////////////////
 // linear resampling
 ///////////////////////////////////////////////////////////////////////////
-void core_linear2(const int idx, const int idy, const int idz, const int idw,
+void core_linear2(const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw,
                   __global       Ty *d_out, const KParam out,
                   __global const Ty *d_in,  const KParam in,
                   __global const Tp *d_pos, const KParam pos,
                   __global const Tp *d_qos, const KParam qos,
-                  const float offGrid)
+                  const float offGrid, const bool pBatch)
 {
-    const int omId = idw * out.strides[3] + idz * out.strides[2]
-                        + idy * out.strides[1] + idx;
-    const int pmId = idy * pos.strides[1] + idx;
-    const int qmId = idy * qos.strides[1] + idx;
+    const dim_t omId = idw * out.strides[3] + idz * out.strides[2]
+                     + idy * out.strides[1] + idx;
+    dim_t pmId = idy * pos.strides[1] + idx;
+    dim_t qmId = idy * qos.strides[1] + idx;
+    if(pBatch) {
+        pmId += idw * pos.strides[3] + idz * pos.strides[2];
+        qmId += idw * qos.strides[3] + idz * qos.strides[2];
+    }
 
     const Tp x = d_pos[pmId], y = d_qos[qmId];
     if (x < 0 || y < 0 || in.dims[0] < x+1 || in.dims[1] < y+1) {
@@ -80,10 +88,10 @@ void core_linear2(const int idx, const int idy, const int idz, const int idw,
         return;
     }
 
-    const int grid_x = floor(x),   grid_y = floor(y);   // nearest grid
+    const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
     const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
 
-    int ioff = idw * in.strides[3] + idz * in.strides[2] + grid_y * in.strides[1] + grid_x;
+    dim_t ioff = idw * in.strides[3] + idz * in.strides[2] + grid_y * in.strides[1] + grid_x;
 
     // Check if pVal and pVal + 1 are both valid indices
     bool condY = (y < in.dims[1] - 1);
@@ -118,16 +126,17 @@ void approx2_kernel(__global       Ty *d_out, const KParam out,
                     __global const Ty *d_in,  const KParam in,
                     __global const Tp *d_pos, const KParam pos,
                     __global const Tp *d_qos, const KParam qos,
-                    const float offGrid, const int blocksMatX, const int blocksMatY)
+                    const float offGrid, const dim_t blocksMatX, const dim_t blocksMatY,
+                    const int pBatch)
 {
-    const int idz = get_group_id(0) / blocksMatX;
-    const int idw = get_group_id(1) / blocksMatY;
+    const dim_t idz = get_group_id(0) / blocksMatX;
+    const dim_t idw = get_group_id(1) / blocksMatY;
 
-    const int blockIdx_x = get_group_id(0) - idz * blocksMatX;
-    const int blockIdx_y = get_group_id(1) - idw * blocksMatY;
+    const dim_t blockIdx_x = get_group_id(0) - idz * blocksMatX;
+    const dim_t blockIdx_y = get_group_id(1) - idw * blocksMatY;
 
-    const int idx = get_local_id(0) + blockIdx_x * get_local_size(0);
-    const int idy = get_local_id(1) + blockIdx_y * get_local_size(1);
+    const dim_t idx = get_local_id(0) + blockIdx_x * get_local_size(0);
+    const dim_t idy = get_local_id(1) + blockIdx_y * get_local_size(1);
 
     if(idx >= out.dims[0] ||
        idy >= out.dims[1] ||
@@ -136,5 +145,5 @@ void approx2_kernel(__global       Ty *d_out, const KParam out,
         return;
 
     INTERP(idx, idy, idz, idw, d_out, out, d_in + in.offset, in,
-           d_pos + pos.offset, pos, d_qos + qos.offset, qos, offGrid);
+           d_pos + pos.offset, pos, d_qos + qos.offset, qos, offGrid, pBatch);
 }
diff --git a/src/backend/opencl/kernel/convolve/conv1.cpp b/src/backend/opencl/kernel/convolve/conv1.cpp
index 7ac1123ee6..86329c3c50 100644
--- a/src/backend/opencl/kernel/convolve/conv1.cpp
+++ b/src/backend/opencl/kernel/convolve/conv1.cpp
@@ -62,6 +62,10 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
 
diff --git a/src/backend/opencl/kernel/convolve/conv2_s16.cpp b/src/backend/opencl/kernel/convolve/conv2_s16.cpp
new file mode 100644
index 0000000000..66b6527e68
--- /dev/null
+++ b/src/backend/opencl/kernel/convolve/conv2_s16.cpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/convolve/conv2_impl.hpp>
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+INSTANTIATE(short, float)
+
+}
+
+}
+
diff --git a/src/backend/opencl/kernel/convolve/conv2_s64.cpp b/src/backend/opencl/kernel/convolve/conv2_s64.cpp
new file mode 100644
index 0000000000..1bd4b53a42
--- /dev/null
+++ b/src/backend/opencl/kernel/convolve/conv2_s64.cpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/convolve/conv2_impl.hpp>
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+INSTANTIATE(intl, float)
+
+}
+
+}
+
diff --git a/src/backend/opencl/kernel/convolve/conv2_u16.cpp b/src/backend/opencl/kernel/convolve/conv2_u16.cpp
new file mode 100644
index 0000000000..419e1a64b4
--- /dev/null
+++ b/src/backend/opencl/kernel/convolve/conv2_u16.cpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/convolve/conv2_impl.hpp>
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+INSTANTIATE(ushort, float)
+
+}
+
+}
+
diff --git a/src/backend/opencl/kernel/convolve/conv2_u64.cpp b/src/backend/opencl/kernel/convolve/conv2_u64.cpp
new file mode 100644
index 0000000000..62fe737cb5
--- /dev/null
+++ b/src/backend/opencl/kernel/convolve/conv2_u64.cpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/convolve/conv2_impl.hpp>
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+INSTANTIATE(uintl, float)
+
+}
+
+}
+
diff --git a/src/backend/opencl/kernel/convolve/conv3.cpp b/src/backend/opencl/kernel/convolve/conv3.cpp
index 844a79f65b..3c9645d32e 100644
--- a/src/backend/opencl/kernel/convolve/conv3.cpp
+++ b/src/backend/opencl/kernel/convolve/conv3.cpp
@@ -47,6 +47,10 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
 
diff --git a/src/backend/opencl/kernel/convolve_separable.cpp b/src/backend/opencl/kernel/convolve_separable.cpp
index e546cc483c..73dd220a5b 100644
--- a/src/backend/opencl/kernel/convolve_separable.cpp
+++ b/src/backend/opencl/kernel/convolve_separable.cpp
@@ -125,6 +125,10 @@ INSTANTIATE(uint   ,   float)
 INSTANTIATE(int    ,   float)
 INSTANTIATE(uchar  ,   float)
 INSTANTIATE(char   ,   float)
+INSTANTIATE(ushort ,   float)
+INSTANTIATE(short  ,   float)
+INSTANTIATE(uintl  ,   float)
+INSTANTIATE(intl   ,   float)
 
 }
 
diff --git a/src/backend/opencl/kernel/fast.hpp b/src/backend/opencl/kernel/fast.hpp
index fcc5a6c58f..1a1354fe4a 100644
--- a/src/backend/opencl/kernel/fast.hpp
+++ b/src/backend/opencl/kernel/fast.hpp
@@ -95,7 +95,7 @@ void fast(const unsigned arc_length,
 
         cl::Buffer *d_flags = d_score;
         if (nonmax) {
-            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
+            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float));
         }
 
         const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X);
diff --git a/src/backend/opencl/kernel/homography.cl b/src/backend/opencl/kernel/homography.cl
new file mode 100644
index 0000000000..618cb28d7d
--- /dev/null
+++ b/src/backend/opencl/kernel/homography.cl
@@ -0,0 +1,516 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+inline T sq(T a)
+{
+    return a * a;
+}
+
+inline void jacobi_svd(__global T* S, __global T* V, int m, int n,
+                       __local T* l_acc1, __local T* l_acc2, __local T* l_S,
+                       __local T* l_V, __local T* l_d)
+{
+    const int iterations = 30;
+
+    int tid_x = get_local_id(0);
+    int bsz_x = get_local_size(0);
+    int tid_y = get_local_id(1);
+    int gid_y = get_global_id(1);
+
+    for (int k = 0; k <= 4; k++)
+        l_S[tid_y * 81 + k*bsz_x + tid_x] = S[gid_y * 81 + k*bsz_x + tid_x];
+    if (tid_x == 0)
+        l_S[tid_y * 81 + 80] = S[gid_y * 81 + 80];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Copy first 80 elements
+    T t = l_S[tid_y*81 + tid_x];
+    l_acc1[tid_y*bsz_x + tid_x] = t*t;
+    for (int i = 1; i <= 4; i++) {
+        T t = l_S[tid_y*81 + tid_x+i*bsz_x];
+        l_acc1[tid_y*bsz_x + tid_x] += t*t;
+    }
+    if (tid_x < 8)
+        l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 4)
+        l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 2)
+        l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 1) {
+        // Copy last element
+        T t = l_S[tid_y*bsz_x + tid_x+80];
+        l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+1] + t*t;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid_x < n)
+        l_d[tid_y*9 + tid_x] = l_acc1[tid_y*bsz_x + tid_x];
+
+    // V is initialized as an identity matrix
+    for (int i = 0; i <= 4; i++) {
+        l_V[tid_y*81 + i*bsz_x + tid_x] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < m)
+        l_V[tid_y*81 + tid_x*m + tid_x] = 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int it = 0; it < iterations; it++) {
+        int converged = 0;
+
+        for (int i = 0; i < n-1; i++) {
+            for (int j = i+1; j < n; j++) {
+                __local T* Si = l_S + tid_y*81 + i*m;
+                __local T* Sj = l_S + tid_y*81 + j*m;
+
+                T p = (T)0;
+                for (int k = 0; k < m; k++)
+                    p += Si[k]*Sj[k];
+
+                T c = 0, s = 0;
+
+                int cond = (fabs(p) > EPS*sqrt(l_d[tid_y*9 + i]*l_d[tid_y*9 + j]));
+                if (cond) {
+                    T y = l_d[tid_y*9 + i] - l_d[tid_y*9 + j];
+                    T r = hypot(p*2, y);
+                    T r2 = r*2;
+                    if (y >= 0) {
+                        c = sqrt((r + y) / r2);
+                        s = p / (r2*c);
+                    }
+                    else {
+                        s = sqrt((r - y) / r2);
+                        c = p / (r2*s);
+                    }
+
+                    if (tid_x < m) {
+                        T t0 = c*Si[tid_x] + s*Sj[tid_x];
+                        T t1 = c*Sj[tid_x] - s*Si[tid_x];
+                        Si[tid_x] = t0;
+                        Sj[tid_x] = t1;
+
+                        l_acc1[tid_y*16 + tid_x] = t0*t0;
+                        l_acc2[tid_y*16 + tid_x] = t1*t1;
+                    }
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (cond && tid_x < 4) {
+                    l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+4];
+                    l_acc2[tid_y*16 + tid_x] += l_acc2[tid_y*16 + tid_x+4];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if (cond && tid_x < 2) {
+                    l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+2];
+                    l_acc2[tid_y*16 + tid_x] += l_acc2[tid_y*16 + tid_x+2];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if (cond && tid_x < 1) {
+                    l_acc1[tid_y*16 + tid_x] += l_acc1[tid_y*16 + tid_x+1] + l_acc1[tid_y*16 + tid_x+8];
+                    l_acc2[tid_y*16 + tid_x] += l_acc2[tid_y*16 + tid_x+1] + l_acc2[tid_y*16 + tid_x+8];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (cond && tid_x == 0) {
+                    l_d[tid_y*9 + i] = l_acc1[tid_y*16];
+                    l_d[tid_y*9 + j] = l_acc2[tid_y*16];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                __local T* Vi = l_V + tid_y*81 + i*n;
+                __local T* Vj = l_V + tid_y*81 + j*n;
+
+                if (cond && tid_x < n) {
+                    T t0 = Vi[tid_x] * c + Vj[tid_x] * s;
+                    T t1 = Vj[tid_x] * c - Vi[tid_x] * s;
+
+                    Vi[tid_x] = t0;
+                    Vj[tid_x] = t1;
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                converged = 1;
+            }
+            if (converged == 0)
+                break;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = 0; i <= 4; i++)
+        V[gid_y * 81 + tid_x+i*bsz_x] = l_V[tid_y * 81 + tid_x+i*bsz_x];
+    if (tid_x == 0)
+        V[gid_y * 81 + 80] = l_V[tid_y * 81 + 80];
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline int compute_mean_scale(
+    float* x_src_mean,
+    float* y_src_mean,
+    float* x_dst_mean,
+    float* y_dst_mean,
+    float* src_scale,
+    float* dst_scale,
+    float* src_pt_x,
+    float* src_pt_y,
+    float* dst_pt_x,
+    float* dst_pt_y,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    KParam rInfo,
+    int i)
+{
+    const unsigned ridx = rInfo.dims[0] * i;
+    unsigned r[4] = { (unsigned)rnd[ridx],
+                      (unsigned)rnd[ridx+1],
+                      (unsigned)rnd[ridx+2],
+                      (unsigned)rnd[ridx+3] };
+
+    // If one of the points is repeated, it's a bad samples, will still
+    // compute homography to ensure all threads pass barrier()
+    int bad = (r[0] == r[1] || r[0] == r[2] || r[0] == r[3] ||
+               r[1] == r[2] || r[1] == r[3] || r[2] == r[3]);
+
+    for (unsigned j = 0; j < 4; j++) {
+        src_pt_x[j] = x_src[r[j]];
+        src_pt_y[j] = y_src[r[j]];
+        dst_pt_x[j] = x_dst[r[j]];
+        dst_pt_y[j] = y_dst[r[j]];
+    }
+
+    *x_src_mean = (src_pt_x[0] + src_pt_x[1] + src_pt_x[2] + src_pt_x[3]) / 4.f;
+    *y_src_mean = (src_pt_y[0] + src_pt_y[1] + src_pt_y[2] + src_pt_y[3]) / 4.f;
+    *x_dst_mean = (dst_pt_x[0] + dst_pt_x[1] + dst_pt_x[2] + dst_pt_x[3]) / 4.f;
+    *y_dst_mean = (dst_pt_y[0] + dst_pt_y[1] + dst_pt_y[2] + dst_pt_y[3]) / 4.f;
+
+    float src_var = 0.0f, dst_var = 0.0f;
+    for (unsigned j = 0; j < 4; j++) {
+        src_var += sq(src_pt_x[j] - *x_src_mean) + sq(src_pt_y[j] - *y_src_mean);
+        dst_var += sq(dst_pt_x[j] - *x_dst_mean) + sq(dst_pt_y[j] - *y_dst_mean);
+    }
+
+    src_var /= 4.f;
+    dst_var /= 4.f;
+
+    *src_scale = sqrt(2.0f) / sqrt(src_var);
+    *dst_scale = sqrt(2.0f) / sqrt(dst_var);
+
+    return !bad;
+}
+
+#define APTR(Z, Y, X) (A[(Z) * AInfo.dims[0] * AInfo.dims[1] + (Y) * AInfo.dims[0] + (X)])
+
+__kernel void compute_homography(
+    __global T* H,
+    KParam HInfo,
+    __global T* A,
+    KParam AInfo,
+    __global T* V,
+    KParam VInfo,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    KParam rInfo,
+    const unsigned iterations)
+{
+    unsigned i = get_global_id(1);
+
+    float x_src_mean, y_src_mean;
+    float x_dst_mean, y_dst_mean;
+    float src_scale, dst_scale;
+    float src_pt_x[4], src_pt_y[4], dst_pt_x[4], dst_pt_y[4];
+
+    compute_mean_scale(&x_src_mean, &y_src_mean,
+                       &x_dst_mean, &y_dst_mean,
+                       &src_scale, &dst_scale,
+                       src_pt_x, src_pt_y,
+                       dst_pt_x, dst_pt_y,
+                       x_src, y_src, x_dst, y_dst,
+                       rnd, rInfo, i);
+
+    // Compute input matrix
+    for (unsigned j = get_local_id(0); j < 4; j+=get_local_size(0)) {
+        float srcx = (src_pt_x[j] - x_src_mean) * src_scale;
+        float srcy = (src_pt_y[j] - y_src_mean) * src_scale;
+        float dstx = (dst_pt_x[j] - x_dst_mean) * dst_scale;
+        float dsty = (dst_pt_y[j] - y_dst_mean) * dst_scale;
+
+        APTR(i, 3, j*2) = -srcx;
+        APTR(i, 4, j*2) = -srcy;
+        APTR(i, 5, j*2) = -1.0f;
+        APTR(i, 6, j*2) = dsty*srcx;
+        APTR(i, 7, j*2) = dsty*srcy;
+        APTR(i, 8, j*2) = dsty;
+
+        APTR(i, 0, j*2+1) = srcx;
+        APTR(i, 1, j*2+1) = srcy;
+        APTR(i, 2, j*2+1) = 1.0f;
+        APTR(i, 6, j*2+1) = -dstx*srcx;
+        APTR(i, 7, j*2+1) = -dstx*srcy;
+        APTR(i, 8, j*2+1) = -dstx;
+    }
+
+    __local T l_acc1[256];
+    __local T l_acc2[256];
+
+    __local T l_S[16*81];
+    __local T l_V[16*81];
+    __local T l_d[16*9];
+
+    jacobi_svd(A, V, 9, 9, l_acc1, l_acc2, l_S, l_V, l_d);
+
+    T vH[9], H_tmp[9];
+    for (unsigned j = 0; j < 9; j++)
+        vH[j] = V[i * VInfo.dims[0] * VInfo.dims[1] + 8 * VInfo.dims[0] + j];
+
+    H_tmp[0] = src_scale*x_dst_mean*vH[6] + src_scale*vH[0]/dst_scale;
+    H_tmp[1] = src_scale*x_dst_mean*vH[7] + src_scale*vH[1]/dst_scale;
+    H_tmp[2] = x_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                          (vH[2] - src_scale*y_src_mean*vH[1] - src_scale*x_src_mean*vH[0])/dst_scale;
+
+    H_tmp[3] = src_scale*y_dst_mean*vH[6] + src_scale*vH[3]/dst_scale;
+    H_tmp[4] = src_scale*y_dst_mean*vH[7] + src_scale*vH[4]/dst_scale;
+    H_tmp[5] = y_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                          (vH[5] - src_scale*y_src_mean*vH[4] - src_scale*x_src_mean*vH[3])/dst_scale;
+
+    H_tmp[6] = src_scale*vH[6];
+    H_tmp[7] = src_scale*vH[7];
+    H_tmp[8] = vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6];
+
+    const unsigned Hidx = HInfo.dims[0] * i;
+    __global T* H_ptr = H + Hidx;
+    for (int h = 0; h < 9; h++)
+        H_ptr[h] = H_tmp[h];
+}
+
+#undef APTR
+
+// LMedS: http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
+__kernel void eval_homography(
+    __global unsigned* inliers,
+    __global unsigned* idx,
+    __global T* H,
+    KParam HInfo,
+    __global float* err,
+    KParam eInfo,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr)
+{
+    unsigned tid_x = get_local_id(0);
+    unsigned i = get_global_id(0);
+
+    __local unsigned l_inliers[256];
+    __local unsigned l_idx[256];
+
+    l_inliers[tid_x] = 0;
+    l_idx[tid_x]     = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (i < iterations) {
+        const unsigned Hidx = HInfo.dims[0] * i;
+        __global T* H_ptr = H + Hidx;
+        T H_tmp[9];
+        for (int h = 0; h < 9; h++)
+            H_tmp[h] = H_ptr[h];
+
+#ifdef RANSAC
+        // Compute inliers
+        unsigned inliers_count = 0;
+        for (unsigned j = 0; j < nsamples; j++) {
+            float z =  H_tmp[6]*x_src[j] + H_tmp[7]*y_src[j] + H_tmp[8];
+            float x = (H_tmp[0]*x_src[j] + H_tmp[1]*y_src[j] + H_tmp[2]) / z;
+            float y = (H_tmp[3]*x_src[j] + H_tmp[4]*y_src[j] + H_tmp[5]) / z;
+
+            float dist = sq(x_dst[j] - x) + sq(y_dst[j] - y);
+            if (dist < inlier_thr*inlier_thr)
+                inliers_count++;
+        }
+
+        l_inliers[tid_x] = inliers_count;
+        l_idx[tid_x]     = i;
+#endif
+#ifdef LMEDS
+        // Compute error
+        for (unsigned j = 0; j < nsamples; j++) {
+            float z =  H_tmp[6]*x_src[j] + H_tmp[7]*y_src[j] + H_tmp[8];
+            float x = (H_tmp[0]*x_src[j] + H_tmp[1]*y_src[j] + H_tmp[2]) / z;
+            float y = (H_tmp[3]*x_src[j] + H_tmp[4]*y_src[j] + H_tmp[5]) / z;
+
+            float dist = sq(x_dst[j] - x) + sq(y_dst[j] - y);
+            err[i*eInfo.dims[0] + j] = sqrt(dist);
+        }
+#endif
+    }
+
+#ifdef RANSAC
+    unsigned bid_x = get_group_id(0);
+
+    // Find sample with most inliers
+    for (unsigned tx = 128; tx > 0; tx >>= 1) {
+        if (tid_x < tx) {
+            if (l_inliers[tid_x + tx] > l_inliers[tid_x]) {
+                l_inliers[tid_x] = l_inliers[tid_x + tx];
+                l_idx[tid_x]     = l_idx[tid_x + tx];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    inliers[bid_x] = l_inliers[0];
+    idx[bid_x]     = l_idx[0];
+#endif
+}
+
+__kernel void compute_median(
+    __global float* median,
+    __global unsigned* idx,
+    __global const float* err,
+    KParam eInfo,
+    const unsigned iterations)
+{
+    const unsigned tid = get_local_id(0);
+    const unsigned bid = get_group_id(0);
+    const unsigned i = get_global_id(0);
+
+    __local float l_median[256];
+    __local unsigned l_idx[256];
+
+    l_median[tid] = FLT_MAX;
+    l_idx[tid] = 0;
+
+    if (i < iterations) {
+        const int nsamples = eInfo.dims[0];
+        float m = err[i*nsamples + nsamples / 2];
+        if (nsamples % 2 == 0)
+            m = (m + err[i*nsamples + nsamples / 2 - 1]) * 0.5f;
+
+        l_idx[tid] = i;
+        l_median[tid] = m;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (l_median[tid + t] < l_median[tid]) {
+                l_median[tid] = l_median[tid + t];
+                l_idx[tid]    = l_idx[tid + t];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    median[bid] = l_median[0];
+    idx[bid] = l_idx[0];
+}
+
+#define DIVUP(A, B) (((A) + (B) - 1) / (B))
+
+__kernel void find_min_median(
+    __global float* minMedian,
+    __global unsigned* minIdx,
+    __global const float* median,
+    KParam mInfo,
+    __global const unsigned* idx)
+{
+    const unsigned tid = get_local_id(0);
+
+    __local float l_minMedian[256];
+    __local unsigned l_minIdx[256];
+
+    l_minMedian[tid] = FLT_MAX;
+    l_minIdx[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int loop = DIVUP(mInfo.dims[0], get_local_size(0));
+
+    for (int i = 0; i < loop; i++) {
+        int j = i * get_local_size(0) + tid;
+        if (j < mInfo.dims[0] && median[j] < l_minMedian[tid]) {
+            l_minMedian[tid] = median[j];
+            l_minIdx[tid] = idx[j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (l_minMedian[tid + t] < l_minMedian[tid]) {
+                l_minMedian[tid] = l_minMedian[tid + t];
+                l_minIdx[tid]    = l_minIdx[tid + t];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    *minMedian = l_minMedian[0];
+    *minIdx = l_minIdx[0];
+}
+
+#undef DIVUP
+
+__kernel void compute_lmeds_inliers(
+    __global unsigned* inliers,
+    __global const T* H,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    const float minMedian,
+    const unsigned nsamples)
+{
+    unsigned tid = get_local_id(0);
+    unsigned bid = get_group_id(0);
+    unsigned i = get_global_id(0);
+
+    __local T l_H[9];
+    __local unsigned l_inliers[256];
+
+    l_inliers[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 9)
+        l_H[tid] = H[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float sigma = fmax(1.4826f * (1 + 5.f/(nsamples - 4)) * (float)sqrt(minMedian), 1e-6f);
+    float dist_thr = sq(2.5f * sigma);
+
+    if (i < nsamples) {
+        float z =  l_H[6]*x_src[i] + l_H[7]*y_src[i] + l_H[8];
+        float x = (l_H[0]*x_src[i] + l_H[1]*y_src[i] + l_H[2]) / z;
+        float y = (l_H[3]*x_src[i] + l_H[4]*y_src[i] + l_H[5]) / z;
+
+        float dist = sq(x_dst[i] - x) + sq(y_dst[i] - y);
+        if (dist <= dist_thr)
+            l_inliers[tid] = 1;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t)
+            l_inliers[tid] += l_inliers[tid + t];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    inliers[bid] = l_inliers[0];
+}
diff --git a/src/backend/opencl/kernel/homography.hpp b/src/backend/opencl/kernel/homography.hpp
new file mode 100644
index 0000000000..714070353b
--- /dev/null
+++ b/src/backend/opencl/kernel/homography.hpp
@@ -0,0 +1,261 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <memory.hpp>
+#include <kernel_headers/homography.hpp>
+#include <kernel/ireduce.hpp>
+#include <kernel/reduce.hpp>
+#include <kernel/sort.hpp>
+#include <cfloat>
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+using std::vector;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+const int HG_THREADS_X = 16;
+const int HG_THREADS_Y = 16;
+const int HG_THREADS   = 256;
+
+template<typename T, af_homography_type htype>
+int computeH(
+    Param bestH,
+    Param H,
+    Param A,
+    Param V,
+    Param err,
+    Param x_src,
+    Param y_src,
+    Param x_dst,
+    Param y_dst,
+    Param rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> hgProgs;
+        static std::map<int, Kernel*>  chKernel;
+        static std::map<int, Kernel*>  ehKernel;
+        static std::map<int, Kernel*>  cmKernel;
+        static std::map<int, Kernel*>  fmKernel;
+        static std::map<int, Kernel*>  clKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName();
+
+                if (std::is_same<T, double>::value) {
+                    options << " -D USE_DOUBLE";
+                    options << " -D EPS=" << DBL_EPSILON;
+                } else
+                    options << " -D EPS=" << FLT_EPSILON;
+
+                if (htype == AF_HOMOGRAPHY_RANSAC)
+                    options << " -D RANSAC";
+                else if (htype == AF_HOMOGRAPHY_LMEDS)
+                    options << " -D LMEDS";
+
+                cl::Program prog;
+                buildProgram(prog, homography_cl, homography_cl_len, options.str());
+                hgProgs[device] = new Program(prog);
+
+                chKernel[device] = new Kernel(*hgProgs[device], "compute_homography");
+                ehKernel[device] = new Kernel(*hgProgs[device], "eval_homography");
+                cmKernel[device] = new Kernel(*hgProgs[device], "compute_median");
+                fmKernel[device] = new Kernel(*hgProgs[device], "find_min_median");
+                clKernel[device] = new Kernel(*hgProgs[device], "compute_lmeds_inliers");
+            });
+
+        const int blk_x_ch = 1;
+        const int blk_y_ch = divup(iterations, HG_THREADS_Y);
+        const NDRange local_ch(HG_THREADS_X, HG_THREADS_Y);
+        const NDRange global_ch(blk_x_ch * HG_THREADS_X, blk_y_ch * HG_THREADS_Y);
+
+        // Build linear system and solve SVD
+        auto chOp = make_kernel<Buffer, KParam, Buffer, KParam,
+                                Buffer, KParam,
+                                Buffer, Buffer, Buffer, Buffer,
+                                Buffer, KParam, unsigned>(*chKernel[device]);
+
+        chOp(EnqueueArgs(getQueue(), global_ch, local_ch),
+             *H.data, H.info, *A.data, A.info,
+             *V.data, V.info,
+             *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+             *rnd.data, rnd.info, iterations);
+        CL_DEBUG_FINISH(getQueue());
+
+        const int blk_x_eh = divup(iterations, HG_THREADS);
+        const NDRange local_eh(HG_THREADS);
+        const NDRange global_eh(blk_x_eh * HG_THREADS);
+
+        // Allocate some temporary buffers
+        Param inliers, idx, median;
+        inliers.info.offset = idx.info.offset = median.info.offset = 0;
+        inliers.info.dims[0] = (htype == AF_HOMOGRAPHY_RANSAC) ? blk_x_eh : divup(nsamples, HG_THREADS);
+        inliers.info.strides[0] = 1;
+        idx.info.dims[0] = median.info.dims[0] = blk_x_eh;
+        idx.info.strides[0] = median.info.strides[0] = 1;
+        for (int k = 1; k < 4; k++) {
+            inliers.info.dims[k] = 1;
+            inliers.info.strides[k] = inliers.info.dims[k-1] * inliers.info.strides[k-1];
+            idx.info.dims[k] = median.info.dims[k] = 1;
+            idx.info.strides[k] = median.info.strides[k] = idx.info.dims[k-1] * idx.info.strides[k-1];
+        }
+        idx.data = bufferAlloc(idx.info.dims[3] * idx.info.strides[3] * sizeof(unsigned));
+        inliers.data = bufferAlloc(inliers.info.dims[3] * inliers.info.strides[3] * sizeof(unsigned));
+        if (htype == AF_HOMOGRAPHY_LMEDS)
+            median.data = bufferAlloc(median.info.dims[3] * median.info.strides[3] * sizeof(float));
+        else
+            median.data = bufferAlloc(sizeof(float));
+
+        // Compute (and for RANSAC, evaluate) homographies
+        auto ehOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                Buffer, KParam,
+                                Buffer, Buffer, Buffer, Buffer,
+                                Buffer, unsigned, unsigned, float>(*ehKernel[device]);
+
+        ehOp(EnqueueArgs(getQueue(), global_eh, local_eh),
+             *inliers.data, *idx.data, *H.data, H.info,
+             *err.data, err.info,
+             *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+             *rnd.data, iterations, nsamples, inlier_thr);
+        CL_DEBUG_FINISH(getQueue());
+
+        unsigned inliersH, idxH;
+        if (htype == AF_HOMOGRAPHY_LMEDS) {
+            // TODO: Improve this sorting, if the number of iterations is
+            // sufficiently large, this can be *very* slow
+            kernel::sort0<float, true>(err);
+
+            unsigned minIdx;
+            float minMedian;
+
+            // Compute median of every iteration
+            auto cmOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                    unsigned>(*cmKernel[device]);
+
+            cmOp(EnqueueArgs(getQueue(), global_eh, local_eh),
+                 *median.data, *idx.data, *err.data, err.info,
+                 iterations);
+            CL_DEBUG_FINISH(getQueue());
+
+            // Reduce medians, only in case iterations > 256
+            if (blk_x_eh > 1) {
+                const NDRange local_fm(HG_THREADS);
+                const NDRange global_fm(HG_THREADS);
+
+                cl::Buffer* finalMedian = bufferAlloc(sizeof(float));
+                cl::Buffer* finalIdx = bufferAlloc(sizeof(unsigned));
+
+                auto fmOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                        Buffer>(*fmKernel[device]);
+
+                fmOp(EnqueueArgs(getQueue(), global_fm, local_fm),
+                     *finalMedian, *finalIdx, *median.data, median.info,
+                     *idx.data);
+                CL_DEBUG_FINISH(getQueue());
+
+                getQueue().enqueueReadBuffer(*finalMedian, CL_TRUE, 0, sizeof(float), &minMedian);
+                getQueue().enqueueReadBuffer(*finalIdx, CL_TRUE, 0, sizeof(unsigned), &minIdx);
+
+                bufferFree(finalMedian);
+                bufferFree(finalIdx);
+            }
+            else {
+                getQueue().enqueueReadBuffer(*median.data, CL_TRUE, 0, sizeof(float), &minMedian);
+                getQueue().enqueueReadBuffer(*idx.data, CL_TRUE, 0, sizeof(unsigned), &minIdx);
+            }
+
+            // Copy best homography to output
+            getQueue().enqueueCopyBuffer(*H.data, *bestH.data, minIdx*9*sizeof(T), 0, 9*sizeof(T));
+
+            const int blk_x_cl = divup(nsamples, HG_THREADS);
+            const NDRange local_cl(HG_THREADS);
+            const NDRange global_cl(blk_x_cl * HG_THREADS);
+
+            auto clOp = make_kernel<Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, Buffer,
+                                    float, unsigned>(*clKernel[device]);
+
+            clOp(EnqueueArgs(getQueue(), global_cl, local_cl),
+                 *inliers.data, *bestH.data,
+                 *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+                 minMedian, nsamples);
+            CL_DEBUG_FINISH(getQueue());
+
+            // Adds up the total number of inliers
+            Param totalInliers;
+            totalInliers.info.offset = 0;
+            for (int k = 0; k < 4; k++)
+                totalInliers.info.dims[k] = totalInliers.info.strides[k] = 1;
+            totalInliers.data = bufferAlloc(sizeof(unsigned));
+
+            kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);
+
+            getQueue().enqueueReadBuffer(*totalInliers.data, CL_TRUE, 0, sizeof(unsigned), &inliersH);
+
+            bufferFree(totalInliers.data);
+        }
+        else if (htype == AF_HOMOGRAPHY_RANSAC) {
+            Param bestInliers, bestIdx;
+            bestInliers.info.offset = bestIdx.info.offset = 0;
+            for (int k = 0; k < 4; k++) {
+                bestInliers.info.dims[k] = bestIdx.info.dims[k] = 1;
+                bestInliers.info.strides[k] = bestIdx.info.strides[k] = 1;
+            }
+            bestInliers.data = bufferAlloc(sizeof(unsigned));
+            bestIdx.data = bufferAlloc(sizeof(unsigned));
+
+            kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.data, inliers, 0);
+
+            unsigned blockIdx;
+            getQueue().enqueueReadBuffer(*bestIdx.data, CL_TRUE, 0, sizeof(unsigned), &blockIdx);
+
+            // Copies back index and number of inliers of best homography estimation
+            getQueue().enqueueReadBuffer(*idx.data, CL_TRUE, blockIdx*sizeof(unsigned), sizeof(unsigned), &idxH);
+            getQueue().enqueueReadBuffer(*bestInliers.data, CL_TRUE, 0, sizeof(unsigned), &inliersH);
+
+            getQueue().enqueueCopyBuffer(*H.data, *bestH.data, idxH*9*sizeof(T), 0, 9*sizeof(T));
+
+            bufferFree(bestInliers.data);
+            bufferFree(bestIdx.data);
+        }
+
+        bufferFree(inliers.data);
+        bufferFree(idx.data);
+        bufferFree(median.data);
+
+        return (int)inliersH;
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+} // namespace kernel
+
+} // namespace cuda
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index be46a597e3..871370d63b 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -484,7 +484,6 @@ void orb(unsigned* out_feat,
             getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
             getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
             getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-
             getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned));
 
             bufferFree(d_x_pyr[i]);
diff --git a/src/backend/opencl/kernel/sift_nonfree.cl b/src/backend/opencl/kernel/sift_nonfree.cl
index 7a65ffa249..dc968d4f4d 100644
--- a/src/backend/opencl/kernel/sift_nonfree.cl
+++ b/src/backend/opencl/kernel/sift_nonfree.cl
@@ -100,6 +100,8 @@
 // factor used to convert floating-point descriptor to unsigned char
 #define INT_DESCR_FCTR 512.f
 
+__constant float GLOHRadii[3] = {6.f, 11.f, 15.f};
+
 #define PI_VAL 3.14159265358979323846f
 
 void gaussianElimination(float* A, float* b, float* x, const int n)
@@ -193,6 +195,58 @@ inline void normalizeDesc(
     barrier(CLK_LOCAL_MEM_FENCE);
 }
 
+inline void normalizeGLOHDesc(
+    __local float* desc,
+    __local float* accum,
+    const int histlen,
+    int lid_x,
+    int lid_y,
+    int lsz_x)
+{
+    for (int i = lid_x; i < histlen; i += lsz_x)
+        accum[i] = desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float sum = 0.0f;
+    for (int i = 0; i < histlen; i++)
+        sum += desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid_x < 128)
+        accum[lid_x] += accum[lid_x+128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 64)
+        accum[lid_x] += accum[lid_x+64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 32)
+        accum[lid_x] += accum[lid_x+32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 16)
+        // GLOH is 272-dimensional, accumulating last 16 descriptors
+        accum[lid_x] += accum[lid_x+16] + accum[lid_x+256];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 8)
+        accum[lid_x] += accum[lid_x+8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 4)
+        accum[lid_x] += accum[lid_x+4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 2)
+        accum[lid_x] += accum[lid_x+2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid_x < 1)
+        accum[lid_x] += accum[lid_x+1];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float len_sq = accum[0];
+    float len_inv = 1.0f / sqrt(len_sq);
+
+    for (int i = lid_x; i < histlen; i += lsz_x) {
+        desc[lid_y*histlen+i] *= len_inv;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
 __kernel void sub(
     __global T* out,
     __global const T* in,
@@ -201,8 +255,10 @@ __kernel void sub(
 {
     unsigned i = get_global_id(0);
 
-    for (unsigned l = 0; l < n_layers; l++)
-        out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i];
+    if (i < nel) {
+        for (unsigned l = 0; l < n_layers; l++)
+            out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i];
+    }
 }
 
 #define LCPTR(Y, X) (l_center[(Y) * l_i + (X)])
@@ -689,10 +745,8 @@ __kernel void computeDescriptor(
     __local float* desc = l_mem;
     __local float* accum = l_mem + desc_len * histsz;
 
-    const int histlen = d*d*n;
-
-    for (int i = lid_x; i < histlen*histsz; i += lsz_x)
-        desc[lid_y*histlen+i] = 0.f;
+    for (int i = lid_x; i < desc_len*histsz; i += lsz_x)
+        desc[lid_y*desc_len+i] = 0.f;
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (f < total_feat) {
@@ -787,13 +841,13 @@ __kernel void computeDescriptor(
         desc[l] += desc[l+desc_len];
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+    normalizeDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
 
     for (int i = lid_x; i < d*d*n; i += lsz_x)
         desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+    normalizeDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
 
     if (f < total_feat) {
         // Calculate final descriptor values
@@ -802,4 +856,165 @@ __kernel void computeDescriptor(
     }
 }
 
+__kernel void computeGLOHDescriptor(
+    __global float* desc_out,
+    const unsigned desc_len,
+    const unsigned histsz,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const unsigned* layer_in,
+    __global const float* response_in,
+    __global const float* size_in,
+    __global const float* ori_in,
+    const unsigned total_feat,
+    __global const T* gauss_octave,
+    KParam iGauss,
+    const int d,
+    const unsigned rb,
+    const unsigned ab,
+    const unsigned hb,
+    const float scale,
+    const int n_layers,
+    __local float* l_mem)
+{
+    const int lid_x = get_local_id(0);
+    const int lid_y = get_local_id(1);
+    const int lsz_x = get_local_size(0);
+
+    const int f = get_global_id(1);
+
+    __local float* desc = l_mem;
+    __local float* accum = l_mem + desc_len * histsz;
+
+    for (int i = lid_x; i < desc_len*histsz; i += lsz_x)
+        desc[lid_y*desc_len+i] = 0.f;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (f < total_feat) {
+        const unsigned layer = layer_in[f];
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+        const float size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Points img to correct Gaussian pyramid layer
+        const int dim0 = iGauss.dims[0];
+        const int dim1 = iGauss.dims[1];
+        __global const T* img = gauss_octave + (layer * dim0 * dim1);
+
+        float cos_t = cos(ori);
+        float sin_t = sin(ori);
+        float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+        float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+        float exp_denom = GLOHRadii[rb-1] * 0.5f;
+
+        float hist_width = DESCR_SCL_FCTR * size * scale * 0.5f;
+
+        // Keep same descriptor radius used for SIFT
+        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+        // Alternative radius size calculation, changing the radius weight
+        // (rw) in the range of 0.25f-0.75f gives different results,
+        // increasing it tends to show a better recall rate but with a
+        // smaller amount of correct matches
+        //float rw = 0.5f;
+        //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
+
+        int len = radius*2+1;
+        const int hist_off = (lid_x % histsz) * desc_len;
+
+        // Calculate orientation histogram
+        for (int l = lid_x; l < len*len; l += lsz_x) {
+            int i = l / len - radius;
+            int j = l % len - radius;
+
+            int y = fy + i;
+            int x = fx + j;
+
+            float x_rot = (j * cos_t - i * sin_t);
+            float y_rot = (j * sin_t + i * cos_t);
+
+            float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+            float theta = atan2(y_rot, x_rot);
+            while (theta < 0.0f)
+                theta += PI_VAL*2;
+            while (theta >= PI_VAL*2)
+                theta -= PI_VAL*2;
+
+            float tbin = theta * polar_bins_per_rad;
+            float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+                         ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+                         min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
+
+            if (r <= GLOHRadii[rb-1] &&
+                y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
+                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+                float grad_mag = sqrt(dx*dx + dy*dy);
+                float grad_ori = atan2(dy, dx) - ori;
+                while (grad_ori < 0.0f)
+                    grad_ori += PI_VAL*2;
+                while (grad_ori >= PI_VAL*2)
+                    grad_ori -= PI_VAL*2;
+
+                float w = exp(-r / exp_denom);
+                float obin = grad_ori * hist_bins_per_rad;
+                float mag = grad_mag*w;
+
+                int t0 = floor(tbin);
+                int r0 = floor(rbin);
+                int o0 = floor(obin);
+                tbin -= t0;
+                rbin -= r0;
+                obin -= o0;
+
+                for (int rl = 0; rl <= 1; rl++) {
+                    int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+                    float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+                    if (rb >= 0 && rb <= 2) {
+                        for (int tl = 0; tl <= 1; tl++) {
+                            int tb = (t0 + tl) % ab;
+                            float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+                            for (int ol = 0; ol <= 1; ol++) {
+                                int ob = (o0 + ol) % hb;
+                                float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+                                unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+		                        fatomic_add(&desc[hist_off + lid_y*desc_len + idx], v_o);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Combine histograms (reduces previous atomicAdd overhead)
+    for (int l = lid_x; l < desc_len*4; l += lsz_x)
+        desc[l] += desc[l+4*desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int l = lid_x; l < desc_len*2; l += lsz_x)
+        desc[l    ] += desc[l+2*desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int l = lid_x; l < desc_len; l += lsz_x)
+        desc[l] += desc[l+desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    normalizeGLOHDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
+
+    for (int i = lid_x; i < desc_len; i += lsz_x)
+        desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    normalizeGLOHDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
+
+    if (f < total_feat) {
+        // Calculate final descriptor values
+        for (int k = lid_x; k < desc_len; k += lsz_x)
+            desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
+    }
+}
+
 #undef IPTR
diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp
index 3d9dfce2e7..c28f432fce 100644
--- a/src/backend/opencl/kernel/sift_nonfree.hpp
+++ b/src/backend/opencl/kernel/sift_nonfree.hpp
@@ -130,6 +130,15 @@ static const int DescrHistBins = 8;
 // default number of bins in histogram for orientation assignment
 static const int OriHistBins = 36;
 
+// Number of GLOH bins in radial direction
+static const unsigned GLOHRadialBins = 3;
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+static const unsigned GLOHHistBins = 16;
+
 static const float PI_VAL = 3.14159265358979323846f;
 
 template<typename T>
@@ -403,7 +412,8 @@ void sift(unsigned* out_feat,
           const float init_sigma,
           const bool double_input,
           const float img_scale,
-          const float feature_ratio)
+          const float feature_ratio,
+          const bool compute_GLOH)
 {
     try {
         static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -414,6 +424,7 @@ void sift(unsigned* out_feat,
         static std::map<int, Kernel*>  coKernel;
         static std::map<int, Kernel*>  rdKernel;
         static std::map<int, Kernel*>  cdKernel;
+        static std::map<int, Kernel*>  cgKernel;
 
         int device = getActiveDeviceId();
 
@@ -437,6 +448,7 @@ void sift(unsigned* out_feat,
                 coKernel[device] = new Kernel(*siftProgs[device], "calcOrientation");
                 rdKernel[device] = new Kernel(*siftProgs[device], "removeDuplicates");
                 cdKernel[device] = new Kernel(*siftProgs[device], "computeDescriptor");
+                cgKernel[device] = new Kernel(*siftProgs[device], "computeGLOHDescriptor");
             });
 
         const unsigned min_dim = (double_input) ? min(img.info.dims[0]*2, img.info.dims[1]*2)
@@ -460,7 +472,10 @@ void sift(unsigned* out_feat,
 
         const unsigned d = DescrWidth;
         const unsigned n = DescrHistBins;
-        const unsigned desc_len = d*d*n;
+        const unsigned rb = GLOHRadialBins;
+        const unsigned ab = GLOHAngularBins;
+        const unsigned hb = GLOHHistBins;
+        const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
 
         cl::Buffer* d_count = bufferAlloc(sizeof(unsigned));
 
@@ -676,17 +691,32 @@ void sift(unsigned* out_feat,
 
             const unsigned histsz = 8;
 
-            auto cdOp = make_kernel<Buffer, unsigned, unsigned,
-                                    Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, int, int, float, int,
-                                    LocalSpaceArg> (*cdKernel[device]);
-
-            cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
-                 *d_desc, desc_len, histsz,
-                 *d_oriented_x, *d_oriented_y, *d_oriented_layer,
-                 *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
-                 *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, n_layers,
-                 cl::Local(desc_len * (histsz+1) * sizeof(float)));
+            if (compute_GLOH) {
+                auto cgOp = make_kernel<Buffer, unsigned, unsigned,
+                                        Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+                                        Buffer, KParam, int, unsigned, unsigned, unsigned, float, int,
+                                        LocalSpaceArg> (*cgKernel[device]);
+
+                cgOp(EnqueueArgs(getQueue(), global_desc, local_desc),
+                     *d_desc, desc_len, histsz,
+                     *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+                     *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
+                     *gauss_pyr[o].data, gauss_pyr[o].info, d, rb, ab, hb, scale, n_layers,
+                     cl::Local(desc_len * (histsz+1) * sizeof(float)));
+            }
+            else {
+                auto cdOp = make_kernel<Buffer, unsigned, unsigned,
+                                        Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+                                        Buffer, KParam, int, int, float, int,
+                                        LocalSpaceArg> (*cdKernel[device]);
+
+                cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
+                     *d_desc, desc_len, histsz,
+                     *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+                     *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
+                     *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, n_layers,
+                     cl::Local(desc_len * (histsz+1) * sizeof(float)));
+            }
             CL_DEBUG_FINISH(getQueue());
 
             total_feat += oriented_feat;
@@ -771,7 +801,6 @@ void sift(unsigned* out_feat,
             getQueue().enqueueCopyBuffer(*d_response_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
             getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
             getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-
             getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*desc_len*sizeof(unsigned), feat_pyr[i] * desc_len * sizeof(unsigned));
 
             bufferFree(d_x_pyr[i]);
diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp
index 58345b9ab8..013d8c53a9 100644
--- a/src/backend/opencl/kernel/sort.hpp
+++ b/src/backend/opencl/kernel/sort.hpp
@@ -38,9 +38,15 @@ namespace opencl
 {
     namespace kernel
     {
-        // Kernel Launch Config Values
-        static const int TX = 32;
-        static const int TY = 8;
+        using std::conditional;
+        using std::is_same;
+        template<typename T>
+        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
+
+        template<typename T>
+        using type_t = typename conditional<is_same<T, uintl>::value,
+                                            cl_ulong, ltype_t<T>
+                                           >::type;
 
         template<typename T, bool isAscending>
         void sort0(Param val)
@@ -60,14 +66,14 @@ namespace opencl
 
                             if(isAscending) {
                                 compute::stable_sort(
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset),
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
-                                        compute::less<T>(), c_queue);
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
+                                        compute::less< type_t<T> >(), c_queue);
                             } else {
                                 compute::stable_sort(
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset),
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
-                                        compute::greater<T>(), c_queue);
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
+                                        compute::greater< type_t<T> >(), c_queue);
                             }
                         }
                     }
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 1ea2a48ca2..0cb9cb042d 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -38,9 +38,15 @@ namespace opencl
 {
     namespace kernel
     {
-        // Kernel Launch Config Values
-        static const int TX = 32;
-        static const int TY = 8;
+        using std::conditional;
+        using std::is_same;
+        template<typename T>
+        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
+
+        template<typename T>
+        using type_t = typename conditional<is_same<T, uintl>::value,
+                                            cl_ulong, ltype_t<T>
+                                           >::type;
 
         template<typename Tk, typename Tv, bool isAscending>
         void sort0_by_key(Param okey, Param oval)
@@ -62,14 +68,14 @@ namespace opencl
                             int okeyOffset = okeyWZ + y * okey.info.strides[1];
                             int ovalOffset = ovalWZ + y * oval.info.strides[1];
 
-                            compute::buffer_iterator<Tk> start= compute::make_buffer_iterator<Tk>(okey_buf, okeyOffset);
-                            compute::buffer_iterator<Tk> end = compute::make_buffer_iterator<Tk>(okey_buf, okeyOffset + okey.info.dims[0]);
-                            compute::buffer_iterator<Tv> vals = compute::make_buffer_iterator<Tv>(oval_buf, ovalOffset);
+                            compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset);
+                            compute::buffer_iterator< type_t<Tk> > end = compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset + okey.info.dims[0]);
+                            compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(oval_buf, ovalOffset);
                             if(isAscending) {
                                 compute::sort_by_key(start, end, vals, c_queue);
                             } else {
                                 compute::sort_by_key(start, end, vals,
-                                                     compute::greater<Tk>(), c_queue);
+                                                     compute::greater< type_t<Tk> >(), c_queue);
                             }
                         }
                     }
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
index 5595b8c7be..3a8ab1401e 100644
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ b/src/backend/opencl/kernel/sort_index.hpp
@@ -39,6 +39,16 @@ namespace opencl
 {
     namespace kernel
     {
+        using std::conditional;
+        using std::is_same;
+        template<typename T>
+        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
+
+        template<typename T>
+        using type_t = typename conditional<is_same<T, uintl>::value,
+                                            cl_ulong, ltype_t<T>
+                                           >::type;
+
         template<typename T, bool isAscending>
         void sort0_index(Param val, Param idx)
         {
@@ -64,14 +74,14 @@ namespace opencl
 
                             if(isAscending) {
                                 compute::sort_by_key(
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset),
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
-                                        idx_begin, compute::less<T>(), c_queue);
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
+                                        idx_begin, compute::less< type_t<T> >(), c_queue);
                             } else {
                                 compute::sort_by_key(
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset),
-                                        compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
-                                        idx_begin, compute::greater<T>(), c_queue);
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
+                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
+                                        idx_begin, compute::greater< type_t<T> >(), c_queue);
                             }
                         }
                     }
diff --git a/src/backend/opencl/lookup.cpp b/src/backend/opencl/lookup.cpp
index e9dc4a3f8c..761200fdef 100644
--- a/src/backend/opencl/lookup.cpp
+++ b/src/backend/opencl/lookup.cpp
@@ -44,6 +44,10 @@ Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const
     template Array<T> lookup<T, double  >(const Array<T> &input, const Array<double  > &indices, const unsigned dim); \
     template Array<T> lookup<T, int     >(const Array<T> &input, const Array<int     > &indices, const unsigned dim); \
     template Array<T> lookup<T, unsigned>(const Array<T> &input, const Array<unsigned> &indices, const unsigned dim); \
+    template Array<T> lookup<T, short   >(const Array<T> &input, const Array<short   > &indices, const unsigned dim); \
+    template Array<T> lookup<T, ushort  >(const Array<T> &input, const Array<ushort  > &indices, const unsigned dim); \
+    template Array<T> lookup<T, intl    >(const Array<T> &input, const Array<intl    > &indices, const unsigned dim); \
+    template Array<T> lookup<T, uintl   >(const Array<T> &input, const Array<uintl   > &indices, const unsigned dim); \
     template Array<T> lookup<T, uchar   >(const Array<T> &input, const Array<uchar   > &indices, const unsigned dim);
 
 INSTANTIATE(float   );
@@ -56,5 +60,7 @@ INSTANTIATE(intl    );
 INSTANTIATE(uintl   );
 INSTANTIATE(uchar   );
 INSTANTIATE(char    );
+INSTANTIATE(ushort  );
+INSTANTIATE(short   );
 
 }
diff --git a/src/backend/opencl/match_template.cpp b/src/backend/opencl/match_template.cpp
index c6e82de681..3d0841025b 100644
--- a/src/backend/opencl/match_template.cpp
+++ b/src/backend/opencl/match_template.cpp
@@ -54,5 +54,7 @@ INSTANTIATE(char  ,  float)
 INSTANTIATE(int   ,  float)
 INSTANTIATE(uint  ,  float)
 INSTANTIATE(uchar ,  float)
+INSTANTIATE(short ,  float)
+INSTANTIATE(ushort,  float)
 
 }
diff --git a/src/backend/opencl/max.cpp b/src/backend/opencl/max.cpp
index d3bee0e23d..2ac2ed2833 100644
--- a/src/backend/opencl/max.cpp
+++ b/src/backend/opencl/max.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_max_t, uintl  , uintl  )
     INSTANTIATE(af_max_t, char   , char   )
     INSTANTIATE(af_max_t, uchar  , uchar  )
+    INSTANTIATE(af_max_t, short  , short  )
+    INSTANTIATE(af_max_t, ushort , ushort )
 }
diff --git a/src/backend/opencl/meanshift.cpp b/src/backend/opencl/meanshift.cpp
index ea1b3bea54..ab884d42e4 100644
--- a/src/backend/opencl/meanshift.cpp
+++ b/src/backend/opencl/meanshift.cpp
@@ -39,5 +39,9 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
+INSTANTIATE(intl  )
+INSTANTIATE(uintl )
 
 }
diff --git a/src/backend/opencl/medfilt.cpp b/src/backend/opencl/medfilt.cpp
index 76fde1a34b..410dbb30af 100644
--- a/src/backend/opencl/medfilt.cpp
+++ b/src/backend/opencl/medfilt.cpp
@@ -51,5 +51,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp
index a545a3f645..f4c740482e 100644
--- a/src/backend/opencl/memory.cpp
+++ b/src/backend/opencl/memory.cpp
@@ -359,4 +359,6 @@ namespace opencl
     INSTANTIATE(uchar)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/min.cpp b/src/backend/opencl/min.cpp
index 9962fdb36d..3dd770264f 100644
--- a/src/backend/opencl/min.cpp
+++ b/src/backend/opencl/min.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_min_t, uintl  , uintl  )
     INSTANTIATE(af_min_t, char   , char   )
     INSTANTIATE(af_min_t, uchar  , uchar  )
+    INSTANTIATE(af_min_t, short  , short  )
+    INSTANTIATE(af_min_t, ushort , ushort )
 }
diff --git a/src/backend/opencl/nearest_neighbour.cpp b/src/backend/opencl/nearest_neighbour.cpp
index 11fc832b9c..58be9678ce 100644
--- a/src/backend/opencl/nearest_neighbour.cpp
+++ b/src/backend/opencl/nearest_neighbour.cpp
@@ -105,6 +105,8 @@ INSTANTIATE(int   , int)
 INSTANTIATE(uint  , uint)
 INSTANTIATE(intl  , intl)
 INSTANTIATE(uintl , uintl)
+INSTANTIATE(short , int)
+INSTANTIATE(ushort, uint)
 INSTANTIATE(uchar , uint)
 
 INSTANTIATE(uintl, uint)    // For Hamming
diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp
index 8d133545a7..85364c4297 100644
--- a/src/backend/opencl/platform.cpp
+++ b/src/backend/opencl/platform.cpp
@@ -74,6 +74,11 @@ static const std::string get_system(void)
 #endif
 }
 
+int getBackend()
+{
+    return AF_BACKEND_OPENCL;
+}
+
 DeviceManager& DeviceManager::getInstance()
 {
     static DeviceManager my_instance;
@@ -277,6 +282,18 @@ int getActiveDeviceId()
     return DeviceManager::getInstance().mActiveQId;
 }
 
+int getDeviceIdFromNativeId(cl_device_id id)
+{
+    DeviceManager& devMngr = DeviceManager::getInstance();
+    int nDevices = devMngr.mDevices.size();
+    int devId = 0;
+    for (devId=0; devId<nDevices; ++devId) {
+        if (id == devMngr.mDevices[devId]->operator()())
+            break;
+    }
+    return devId;
+}
+
 const Context& getContext()
 {
     DeviceManager& devMngr = DeviceManager::getInstance();
@@ -484,3 +501,9 @@ af_err afcl_get_device_id(cl_device_id *id)
     *id = getDevice()();
     return AF_SUCCESS;
 }
+
+af_err afcl_set_device_id(cl_device_id id)
+{
+    setDevice(getDeviceIdFromNativeId(id));
+    return AF_SUCCESS;
+}
diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp
index 8d3a1d00a9..90f57aed39 100644
--- a/src/backend/opencl/platform.hpp
+++ b/src/backend/opencl/platform.hpp
@@ -27,6 +27,8 @@ class DeviceManager
 
     friend int getActiveDeviceId();
 
+    friend int getDeviceIdFromNativeId(cl_device_id id);
+
     friend const cl::Context& getContext();
 
     friend cl::CommandQueue& getQueue();
@@ -76,6 +78,8 @@ class DeviceManager
         unsigned mActiveQId;
 };
 
+int getBackend();
+
 std::string getInfo();
 
 int getDeviceCount();
diff --git a/src/backend/opencl/plot.cpp b/src/backend/opencl/plot.cpp
index 5a5712b86a..4eb240f3e9 100644
--- a/src/backend/opencl/plot.cpp
+++ b/src/backend/opencl/plot.cpp
@@ -66,6 +66,8 @@ INSTANTIATE(float)
 INSTANTIATE(double)
 INSTANTIATE(int)
 INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 INSTANTIATE(uchar)
 
 }
diff --git a/src/backend/opencl/plot.hpp b/src/backend/opencl/plot.hpp
index 582d02e046..c195694869 100644
--- a/src/backend/opencl/plot.hpp
+++ b/src/backend/opencl/plot.hpp
@@ -20,4 +20,3 @@ namespace opencl
 
 #endif
 
-
diff --git a/src/backend/opencl/plot3.cpp b/src/backend/opencl/plot3.cpp
new file mode 100644
index 0000000000..ce3355d63c
--- /dev/null
+++ b/src/backend/opencl/plot3.cpp
@@ -0,0 +1,72 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <interopManager.hpp>
+#include <Array.hpp>
+#include <plot3.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+
+using af::dim4;
+
+namespace opencl
+{
+
+template<typename T>
+void copy_plot3(const Array<T> &P, fg::Plot3* plot3)
+{
+    if (isGLSharingSupported()) {
+        CheckGL("Begin OpenCL resource copy");
+        const cl::Buffer *d_P = P.get();
+        size_t bytes = plot3->size();
+
+        InteropManager& intrpMngr = InteropManager::getInstance();
+
+        cl::Buffer *clPBOResource = intrpMngr.getBufferResource(plot3);
+
+        std::vector<cl::Memory> shared_objects;
+        shared_objects.push_back(*clPBOResource);
+
+        glFinish();
+        getQueue().enqueueAcquireGLObjects(&shared_objects);
+        getQueue().enqueueCopyBuffer(*d_P, *clPBOResource, 0, 0, bytes, NULL, NULL);
+        getQueue().finish();
+        getQueue().enqueueReleaseGLObjects(&shared_objects);
+
+        CL_DEBUG_FINISH(getQueue());
+        CheckGL("End OpenCL resource copy");
+    } else {
+        CheckGL("Begin OpenCL fallback-resource copy");
+        glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo());
+        GLubyte* ptr = (GLubyte*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+        if (ptr) {
+            getQueue().enqueueReadBuffer(*P.get(), CL_TRUE, 0, plot3->size(), ptr);
+            glUnmapBuffer(GL_ARRAY_BUFFER);
+        }
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+        CheckGL("End OpenCL fallback-resource copy");
+    }
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_plot3<T>(const Array<T> &P, fg::Plot3* plot3);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(uchar)
+
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/opencl/plot3.hpp b/src/backend/opencl/plot3.hpp
new file mode 100644
index 0000000000..86093908a6
--- /dev/null
+++ b/src/backend/opencl/plot3.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace opencl
+{
+    template<typename T>
+    void copy_plot3(const Array<T> &P, fg::Plot3* plot3);
+}
+
+#endif
+
diff --git a/src/backend/opencl/product.cpp b/src/backend/opencl/product.cpp
index 3f32caeb41..d9019ba973 100644
--- a/src/backend/opencl/product.cpp
+++ b/src/backend/opencl/product.cpp
@@ -22,4 +22,6 @@ namespace opencl
     INSTANTIATE(af_mul_t, uintl  , uintl  )
     INSTANTIATE(af_mul_t, char   , int    )
     INSTANTIATE(af_mul_t, uchar  , uint   )
+    INSTANTIATE(af_mul_t, short  , int    )
+    INSTANTIATE(af_mul_t, ushort , uint   )
 }
diff --git a/src/backend/opencl/random.cpp b/src/backend/opencl/random.cpp
index a6f2b6731e..3d98fc6698 100644
--- a/src/backend/opencl/random.cpp
+++ b/src/backend/opencl/random.cpp
@@ -41,6 +41,8 @@ namespace opencl
     template Array<uint>   randu<uint>    (const af::dim4 &dims);
     template Array<intl>   randu<intl>    (const af::dim4 &dims);
     template Array<uintl>  randu<uintl>   (const af::dim4 &dims);
+    template Array<short>  randu<short>   (const af::dim4 &dims);
+    template Array<ushort> randu<ushort>  (const af::dim4 &dims);
     template Array<char>   randu<char>    (const af::dim4 &dims);
     template Array<uchar>  randu<uchar>   (const af::dim4 &dims);
 
diff --git a/src/backend/opencl/range.cpp b/src/backend/opencl/range.cpp
index faeb4fa80e..61bba9c613 100644
--- a/src/backend/opencl/range.cpp
+++ b/src/backend/opencl/range.cpp
@@ -45,4 +45,6 @@ namespace opencl
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/regions.cpp b/src/backend/opencl/regions.cpp
index 0ca6a083c9..001a0002cf 100644
--- a/src/backend/opencl/regions.cpp
+++ b/src/backend/opencl/regions.cpp
@@ -48,5 +48,7 @@ INSTANTIATE(float )
 INSTANTIATE(double)
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/reorder.cpp b/src/backend/opencl/reorder.cpp
index 403f612910..c10472df75 100644
--- a/src/backend/opencl/reorder.cpp
+++ b/src/backend/opencl/reorder.cpp
@@ -43,4 +43,6 @@ namespace opencl
     INSTANTIATE(char)
     INSTANTIATE(intl)
     INSTANTIATE(uintl)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/resize.cpp b/src/backend/opencl/resize.cpp
index 10f27356ca..051d9554db 100644
--- a/src/backend/opencl/resize.cpp
+++ b/src/backend/opencl/resize.cpp
@@ -58,4 +58,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/rotate.cpp b/src/backend/opencl/rotate.cpp
index b7888d00a8..404b79af91 100644
--- a/src/backend/opencl/rotate.cpp
+++ b/src/backend/opencl/rotate.cpp
@@ -54,4 +54,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/scan.cpp b/src/backend/opencl/scan.cpp
index 74375dad38..3ac929a537 100644
--- a/src/backend/opencl/scan.cpp
+++ b/src/backend/opencl/scan.cpp
@@ -56,5 +56,7 @@ namespace opencl
     INSTANTIATE(af_add_t, uintl  , uintl  )
     INSTANTIATE(af_add_t, char   , int    )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, ushort , uint   )
     INSTANTIATE(af_notzero_t, char  , uint)
 }
diff --git a/src/backend/opencl/select.cpp b/src/backend/opencl/select.cpp
index 5c9a5d0fba..7e7200167b 100644
--- a/src/backend/opencl/select.cpp
+++ b/src/backend/opencl/select.cpp
@@ -49,4 +49,6 @@ namespace opencl
     INSTANTIATE(uintl  )
     INSTANTIATE(char   )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 }
diff --git a/src/backend/opencl/set.cpp b/src/backend/opencl/set.cpp
index 665ffdf105..5604ff4ad9 100644
--- a/src/backend/opencl/set.cpp
+++ b/src/backend/opencl/set.cpp
@@ -31,6 +31,16 @@ namespace opencl
 {
     using af::dim4;
 
+    using std::conditional;
+    using std::is_same;
+    template<typename T>
+    using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
+
+    template<typename T>
+    using type_t = typename conditional<is_same<T, uintl>::value,
+                                        cl_ulong, ltype_t<T>
+                                       >::type;
+
     template<typename T>
     Array<T> setUnique(const Array<T> &in,
                        const bool is_sorted)
@@ -42,8 +52,8 @@ namespace opencl
 
             compute::buffer out_data((*out.get())());
 
-            compute::buffer_iterator<T> begin(out_data, 0);
-            compute::buffer_iterator<T> end(out_data, out.dims()[0]);
+            compute::buffer_iterator< type_t<T> > begin(out_data, 0);
+            compute::buffer_iterator< type_t<T> > end(out_data, out.dims()[0]);
 
             if (!is_sorted) {
                 compute::sort(begin, end, queue);
@@ -82,13 +92,13 @@ namespace opencl
             compute::buffer second_data((*unique_second.get())());
             compute::buffer out_data((*out.get())());
 
-            compute::buffer_iterator<T> first_begin(first_data, 0);
-            compute::buffer_iterator<T> first_end(first_data, unique_first.dims()[0]);
-            compute::buffer_iterator<T> second_begin(second_data, 0);
-            compute::buffer_iterator<T> second_end(second_data, unique_second.dims()[0]);
-            compute::buffer_iterator<T> out_begin(out_data, 0);
+            compute::buffer_iterator< type_t<T> > first_begin(first_data, 0);
+            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.dims()[0]);
+            compute::buffer_iterator< type_t<T> > second_begin(second_data, 0);
+            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.dims()[0]);
+            compute::buffer_iterator< type_t<T> > out_begin(out_data, 0);
 
-            compute::buffer_iterator<T> out_end = compute::set_union(
+            compute::buffer_iterator< type_t<T> > out_end = compute::set_union(
                 first_begin, first_end, second_begin, second_end, out_begin, queue
                 );
 
@@ -123,13 +133,13 @@ namespace opencl
             compute::buffer second_data((*unique_second.get())());
             compute::buffer out_data((*out.get())());
 
-            compute::buffer_iterator<T> first_begin(first_data, 0);
-            compute::buffer_iterator<T> first_end(first_data, unique_first.dims()[0]);
-            compute::buffer_iterator<T> second_begin(second_data, 0);
-            compute::buffer_iterator<T> second_end(second_data, unique_second.dims()[0]);
-            compute::buffer_iterator<T> out_begin(out_data, 0);
+            compute::buffer_iterator< type_t<T> > first_begin(first_data, 0);
+            compute::buffer_iterator< type_t<T> > first_end(first_data, unique_first.dims()[0]);
+            compute::buffer_iterator< type_t<T> > second_begin(second_data, 0);
+            compute::buffer_iterator< type_t<T> > second_end(second_data, unique_second.dims()[0]);
+            compute::buffer_iterator< type_t<T> > out_begin(out_data, 0);
 
-            compute::buffer_iterator<T> out_end = compute::set_intersection(
+            compute::buffer_iterator< type_t<T> > out_end = compute::set_intersection(
                 first_begin, first_end, second_begin, second_end, out_begin, queue
                 );
 
@@ -151,6 +161,10 @@ namespace opencl
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 }
 
 #pragma GCC diagnostic pop
diff --git a/src/backend/opencl/shift.cpp b/src/backend/opencl/shift.cpp
index 5cbb71dcd2..61cbee9b75 100644
--- a/src/backend/opencl/shift.cpp
+++ b/src/backend/opencl/shift.cpp
@@ -41,4 +41,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp
index 7e5aa6d838..5bd940d127 100644
--- a/src/backend/opencl/sift.cpp
+++ b/src/backend/opencl/sift.cpp
@@ -31,7 +31,8 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio)
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH)
 {
 #ifdef AF_BUILD_SIFT
     unsigned nfeat_out;
@@ -46,7 +47,7 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
 
     kernel::sift<T,convAccT>(&nfeat_out, &desc_len, x, y, score, ori, size, desc,
                              in, n_layers, contrast_thr, edge_thr, init_sigma,
-                             double_input, img_scale, feature_ratio);
+                             double_input, img_scale, feature_ratio, compute_GLOH);
 
     if (nfeat_out > 0) {
         const dim4 out_dims(nfeat_out);
@@ -62,19 +63,23 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
 
     return nfeat_out;
 #else
-    AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+    if (compute_GLOH)
+        AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AF_ERR_NONFREE);
+    else
+        AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AF_ERR_NONFREE);
 #endif
 }
 
 
-#define INSTANTIATE(T, convAccT)                                        \
-    template unsigned sift<T, convAccT>(Array<float>& x_out, Array<float>& y_out, \
-                                        Array<float>& score_out, Array<float>& ori_out, \
-                                        Array<float>& size_out, Array<float>& desc_out, \
-                                        const Array<T>& in, const unsigned n_layers, \
-                                        const float contrast_thr, const float edge_thr, \
-                                        const float init_sigma, const bool double_input, \
-                                        const float img_scale, const float feature_ratio);
+#define INSTANTIATE(T, convAccT)                                                            \
+    template unsigned sift<T, convAccT>(Array<float>& x_out, Array<float>& y_out,           \
+                                        Array<float>& score_out, Array<float>& ori_out,     \
+                                        Array<float>& size_out, Array<float>& desc_out,     \
+                                        const Array<T>& in, const unsigned n_layers,        \
+                                        const float contrast_thr, const float edge_thr,     \
+                                        const float init_sigma, const bool double_input,    \
+                                        const float img_scale, const float feature_ratio,   \
+                                        const bool compute_GLOH);
 
 INSTANTIATE(float , float )
 INSTANTIATE(double, double)
diff --git a/src/backend/opencl/sift.hpp b/src/backend/opencl/sift.hpp
index 96b422f14f..1587fc9655 100644
--- a/src/backend/opencl/sift.hpp
+++ b/src/backend/opencl/sift.hpp
@@ -21,6 +21,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
               const Array<T>& in, const unsigned n_layers,
               const float contrast_thr, const float edge_thr,
               const float init_sigma, const bool double_input,
-              const float img_scale, const float feature_ratio);
+              const float img_scale, const float feature_ratio,
+              const bool compute_GLOH);
 
 }
diff --git a/src/backend/opencl/sobel.cpp b/src/backend/opencl/sobel.cpp
index a8c76f9636..7acb007156 100644
--- a/src/backend/opencl/sobel.cpp
+++ b/src/backend/opencl/sobel.cpp
@@ -44,5 +44,7 @@ INSTANTIATE(int   , int)
 INSTANTIATE(uint  , int)
 INSTANTIATE(char  , int)
 INSTANTIATE(uchar , int)
+INSTANTIATE(short , int)
+INSTANTIATE(ushort, int)
 
 }
diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp
index 33c4f83257..762d815095 100644
--- a/src/backend/opencl/sort.cpp
+++ b/src/backend/opencl/sort.cpp
@@ -43,5 +43,9 @@ namespace opencl
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 
 }
diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp
index 73bcaf2e88..49d184113f 100644
--- a/src/backend/opencl/sort_by_key/impl.hpp
+++ b/src/backend/opencl/sort_by_key/impl.hpp
@@ -49,5 +49,9 @@ namespace opencl
     INSTANTIATE(Tk, uint  , isAscending)        \
     INSTANTIATE(Tk, char  , isAscending)        \
     INSTANTIATE(Tk, uchar , isAscending)        \
+    INSTANTIATE(Tk, short , isAscending)        \
+    INSTANTIATE(Tk, ushort, isAscending)        \
+    INSTANTIATE(Tk, intl  , isAscending)        \
+    INSTANTIATE(Tk, uintl , isAscending)        \
 
 }
diff --git a/src/backend/opencl/sort_by_key/s16.cpp b/src/backend/opencl/sort_by_key/s16.cpp
new file mode 100644
index 0000000000..44e17b5030
--- /dev/null
+++ b/src/backend/opencl/sort_by_key/s16.cpp
@@ -0,0 +1,16 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include "impl.hpp"
+
+namespace opencl
+{
+    INSTANTIATE1(short,true)
+    INSTANTIATE1(short,false)
+}
diff --git a/src/backend/opencl/sort_by_key/s64.cpp b/src/backend/opencl/sort_by_key/s64.cpp
new file mode 100644
index 0000000000..e2ed8d687b
--- /dev/null
+++ b/src/backend/opencl/sort_by_key/s64.cpp
@@ -0,0 +1,16 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include "impl.hpp"
+
+namespace opencl
+{
+    INSTANTIATE1(intl,true)
+    INSTANTIATE1(intl,false)
+}
diff --git a/src/backend/opencl/sort_by_key/u16.cpp b/src/backend/opencl/sort_by_key/u16.cpp
new file mode 100644
index 0000000000..c53b68fb53
--- /dev/null
+++ b/src/backend/opencl/sort_by_key/u16.cpp
@@ -0,0 +1,16 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include "impl.hpp"
+
+namespace opencl
+{
+    INSTANTIATE1(ushort,true)
+    INSTANTIATE1(ushort,false)
+}
diff --git a/src/backend/opencl/sort_by_key/u64.cpp b/src/backend/opencl/sort_by_key/u64.cpp
new file mode 100644
index 0000000000..89649b1ba5
--- /dev/null
+++ b/src/backend/opencl/sort_by_key/u64.cpp
@@ -0,0 +1,16 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include "impl.hpp"
+
+namespace opencl
+{
+    INSTANTIATE1(uintl,true)
+    INSTANTIATE1(uintl,false)
+}
diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp
index ebbd9f543c..c7aaa70feb 100644
--- a/src/backend/opencl/sort_index.cpp
+++ b/src/backend/opencl/sort_index.cpp
@@ -45,5 +45,9 @@ namespace opencl
     INSTANTIATE(uint)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
+    INSTANTIATE(intl)
+    INSTANTIATE(uintl)
 
 }
diff --git a/src/backend/opencl/sum.cpp b/src/backend/opencl/sum.cpp
index cbe3c5f492..9ae378fd6e 100644
--- a/src/backend/opencl/sum.cpp
+++ b/src/backend/opencl/sum.cpp
@@ -17,9 +17,19 @@ namespace opencl
     INSTANTIATE(af_add_t, cfloat , cfloat )
     INSTANTIATE(af_add_t, cdouble, cdouble)
     INSTANTIATE(af_add_t, int    , int    )
+    INSTANTIATE(af_add_t, int    , float  )
     INSTANTIATE(af_add_t, uint   , uint   )
+    INSTANTIATE(af_add_t, uint   , float  )
     INSTANTIATE(af_add_t, intl   , intl   )
+    INSTANTIATE(af_add_t, intl   , double )
     INSTANTIATE(af_add_t, uintl  , uintl  )
+    INSTANTIATE(af_add_t, uintl  , double )
     INSTANTIATE(af_add_t, char   , int    )
+    INSTANTIATE(af_add_t, char   , float  )
     INSTANTIATE(af_add_t, uchar  , uint   )
+    INSTANTIATE(af_add_t, uchar  , float  )
+    INSTANTIATE(af_add_t, short  , int    )
+    INSTANTIATE(af_add_t, short  , float  )
+    INSTANTIATE(af_add_t, ushort , uint   )
+    INSTANTIATE(af_add_t, ushort , float  )
 }
diff --git a/src/backend/opencl/surface.cpp b/src/backend/opencl/surface.cpp
new file mode 100644
index 0000000000..8116941a77
--- /dev/null
+++ b/src/backend/opencl/surface.cpp
@@ -0,0 +1,75 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <interopManager.hpp>
+#include <Array.hpp>
+#include <surface.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <join.hpp>
+#include <reduce.hpp>
+#include <reorder.hpp>
+
+using af::dim4;
+
+namespace opencl
+{
+
+template<typename T>
+void copy_surface(const Array<T> &P, fg::Surface* surface)
+{
+    if (isGLSharingSupported()) {
+        CheckGL("Begin OpenCL resource copy");
+        const cl::Buffer *d_P = P.get();
+        size_t bytes = surface->size();
+
+        InteropManager& intrpMngr = InteropManager::getInstance();
+
+        cl::Buffer *clPBOResource = intrpMngr.getBufferResource(surface);
+
+        std::vector<cl::Memory> shared_objects;
+        shared_objects.push_back(*clPBOResource);
+
+        glFinish();
+        getQueue().enqueueAcquireGLObjects(&shared_objects);
+        getQueue().enqueueCopyBuffer(*d_P, *clPBOResource, 0, 0, bytes, NULL, NULL);
+        getQueue().finish();
+        getQueue().enqueueReleaseGLObjects(&shared_objects);
+
+        CL_DEBUG_FINISH(getQueue());
+        CheckGL("End OpenCL resource copy");
+    } else {
+        CheckGL("Begin OpenCL fallback-resource copy");
+        glBindBuffer(GL_ARRAY_BUFFER, surface->vbo());
+        GLubyte* ptr = (GLubyte*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+        if (ptr) {
+            getQueue().enqueueReadBuffer(*P.get(), CL_TRUE, 0, surface->size(), ptr);
+            glUnmapBuffer(GL_ARRAY_BUFFER);
+        }
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+        CheckGL("End OpenCL fallback-resource copy");
+    }
+}
+
+#define INSTANTIATE(T)  \
+    template void copy_surface<T>(const Array<T> &P, fg::Surface* surface);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(uchar)
+
+}
+
+#endif  // WITH_GRAPHICS
diff --git a/src/backend/opencl/surface.hpp b/src/backend/opencl/surface.hpp
new file mode 100644
index 0000000000..15079f0159
--- /dev/null
+++ b/src/backend/opencl/surface.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#if defined (WITH_GRAPHICS)
+
+#include <Array.hpp>
+#include <graphics_common.hpp>
+
+namespace opencl
+{
+    template<typename T>
+    void copy_surface(const Array<T> &P, fg::Surface* surface);
+}
+
+#endif
+
+
diff --git a/src/backend/opencl/susan.cpp b/src/backend/opencl/susan.cpp
index 5e9b1a6b8e..b390566194 100644
--- a/src/backend/opencl/susan.cpp
+++ b/src/backend/opencl/susan.cpp
@@ -51,14 +51,20 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
     bufferFree(resp);
 
     const unsigned corners_out = std::min(corners_found, corner_lim);
-    if (corners_out == 0)
+    if (corners_out == 0) {
+        bufferFree(x_corners);
+        bufferFree(y_corners);
+        bufferFree(resp_corners);
+        x_out    = createEmptyArray<float>(dim4());
+        y_out    = createEmptyArray<float>(dim4());
+        resp_out = createEmptyArray<float>(dim4());
         return 0;
-
-    x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)()));
-    y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)()));
-    resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)()));
-
-    return corners_out;
+    } else {
+        x_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)()));
+        y_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)()));
+        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)()));
+        return corners_out;
+    }
 }
 
 #define INSTANTIATE(T) \
@@ -72,5 +78,7 @@ INSTANTIATE(char  )
 INSTANTIATE(int   )
 INSTANTIATE(uint  )
 INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/tile.cpp b/src/backend/opencl/tile.cpp
index 794059d7aa..38902ad44d 100644
--- a/src/backend/opencl/tile.cpp
+++ b/src/backend/opencl/tile.cpp
@@ -41,5 +41,7 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/transform.cpp b/src/backend/opencl/transform.cpp
index 34bfead6e5..c8e2b69a8b 100644
--- a/src/backend/opencl/transform.cpp
+++ b/src/backend/opencl/transform.cpp
@@ -80,4 +80,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/transpose.cpp b/src/backend/opencl/transpose.cpp
index 43a1da9df3..cbc2345ccd 100644
--- a/src/backend/opencl/transpose.cpp
+++ b/src/backend/opencl/transpose.cpp
@@ -52,5 +52,7 @@ INSTANTIATE(uint   )
 INSTANTIATE(uchar  )
 INSTANTIATE(intl   )
 INSTANTIATE(uintl  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/opencl/transpose_inplace.cpp b/src/backend/opencl/transpose_inplace.cpp
index c30ff2e058..0cf758e64a 100644
--- a/src/backend/opencl/transpose_inplace.cpp
+++ b/src/backend/opencl/transpose_inplace.cpp
@@ -48,5 +48,7 @@ INSTANTIATE(uint   )
 INSTANTIATE(uchar  )
 INSTANTIATE(intl   )
 INSTANTIATE(uintl  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/opencl/triangle.cpp b/src/backend/opencl/triangle.cpp
index 371aead83c..0dd6357e08 100644
--- a/src/backend/opencl/triangle.cpp
+++ b/src/backend/opencl/triangle.cpp
@@ -53,5 +53,7 @@ Array<T> triangle(const Array<T> &in)
     INSTANTIATE(uintl)
     INSTANTIATE(char)
     INSTANTIATE(uchar)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/opencl/types.cpp b/src/backend/opencl/types.cpp
index df8e76a78c..6581b047db 100644
--- a/src/backend/opencl/types.cpp
+++ b/src/backend/opencl/types.cpp
@@ -25,5 +25,7 @@ namespace opencl
     template<> const char *shortname<uchar   >(bool caps) { return caps ? "V" : "v"; }
     template<> const char *shortname<intl    >(bool caps) { return caps ? "L" : "l"; }
     template<> const char *shortname<uintl   >(bool caps) { return caps ? "K" : "k"; }
+    template<> const char *shortname<short   >(bool caps) { return caps ? "P" : "p"; }
+    template<> const char *shortname<ushort  >(bool caps) { return caps ? "Q" : "q"; }
 
 }
diff --git a/src/backend/opencl/types.hpp b/src/backend/opencl/types.hpp
index 69f5030646..f0ed13382c 100644
--- a/src/backend/opencl/types.hpp
+++ b/src/backend/opencl/types.hpp
@@ -21,6 +21,7 @@ namespace opencl
     typedef cl_double2 cdouble;
     typedef cl_uchar     uchar;
     typedef cl_uint       uint;
+    typedef cl_ushort   ushort;
 
     template<typename T> struct is_complex          { static const bool value = false;  };
     template<> struct           is_complex<cfloat>  { static const bool value = true;   };
diff --git a/src/backend/opencl/unwrap.cpp b/src/backend/opencl/unwrap.cpp
index 4fc91a7c0c..845b341699 100644
--- a/src/backend/opencl/unwrap.cpp
+++ b/src/backend/opencl/unwrap.cpp
@@ -53,4 +53,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/src/backend/opencl/where.cpp b/src/backend/opencl/where.cpp
index 1ce82bf717..19bc7cf1bc 100644
--- a/src/backend/opencl/where.cpp
+++ b/src/backend/opencl/where.cpp
@@ -41,5 +41,7 @@ namespace opencl
     INSTANTIATE(intl   )
     INSTANTIATE(uintl  )
     INSTANTIATE(uchar  )
+    INSTANTIATE(short  )
+    INSTANTIATE(ushort )
 
 }
diff --git a/src/backend/opencl/wrap.cpp b/src/backend/opencl/wrap.cpp
index f3a5e1bd5d..90849fc0f7 100644
--- a/src/backend/opencl/wrap.cpp
+++ b/src/backend/opencl/wrap.cpp
@@ -54,4 +54,6 @@ namespace opencl
     INSTANTIATE(uintl)
     INSTANTIATE(uchar)
     INSTANTIATE(char)
+    INSTANTIATE(short)
+    INSTANTIATE(ushort)
 }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 662a74dee5..30907d3390 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,22 +5,24 @@ FIND_PACKAGE(CUDA QUIET)
 FIND_PACKAGE(OpenCL QUIET)
 
 
-MACRO(CREATE_TESTS BACKEND GTEST_LIBS OTHER_LIBS)
+MACRO(CREATE_TESTS BACKEND LIBNAME GTEST_LIBS OTHER_LIBS)
     STRING(TOUPPER ${BACKEND} DEF_NAME)
 
     FOREACH(FILE ${FILES})
         GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE)
         SET(TEST_NAME ${FNAME}_${BACKEND})
 
-        IF ("${FILE}" MATCHES ".manual." OR "${FILE}" MATCHES ".nonfree.")
-          MESSAGE(STATUS "Removing ${FILE} from ctest")
+        IF(NOT ${BUILD_NONFREE} AND "${FILE}" MATCHES ".nonfree.")
+            MESSAGE(STATUS "Removing ${FILE} from ctest")
+        ELSEIF("${FILE}" MATCHES ".manual.")
+            MESSAGE(STATUS "Removing ${FILE} from ctest")
         ELSE()
           ADD_TEST(Test_${TEST_NAME} ${TEST_NAME})
         ENDIF()
 
         FILE(GLOB TEST_FILE "${FNAME}.cpp" "${FNAME}.c")
         ADD_EXECUTABLE(${TEST_NAME} ${TEST_FILE})
-        TARGET_LINK_LIBRARIES(${TEST_NAME}  PRIVATE  af${BACKEND}
+        TARGET_LINK_LIBRARIES(${TEST_NAME}  PRIVATE  af${LIBNAME}
                                             PRIVATE ${THREAD_LIB_FLAG}
                                             PRIVATE ${GTEST_LIBS}
                                             PRIVATE ${OTHER_LIBS})
@@ -40,15 +42,6 @@ ELSE()
     SET(THREAD_LIB_FLAG ${CMAKE_THREAD_LIBS_INIT})
 ENDIF()
 
-OPTION(USE_SYSTEM_GTEST "Use GTEST from system libraries" OFF)
-IF(USE_SYSTEM_GTEST)
-    FIND_PACKAGE(GTest REQUIRED)
-ELSE(USE_SYSTEM_GTEST)
-    INCLUDE("${CMAKE_MODULE_PATH}/build_gtest.cmake")
-ENDIF(USE_SYSTEM_GTEST)
-
-INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
-
 OPTION(USE_RELATIVE_TEST_DIR "Use relative paths for the test data directory(For continious integration(CI) purposes only)" OFF)
 
 IF(${USE_RELATIVE_TEST_DIR})
@@ -72,21 +65,30 @@ IF (EXISTS "${TESTDATA_SOURCE_DIR}" AND IS_DIRECTORY "${TESTDATA_SOURCE_DIR}"
     # Do Nothing
 ELSE (EXISTS "${TESTDATA_SOURCE_DIR}" AND IS_DIRECTORY "${TESTDATA_SOURCE_DIR}"
     AND EXISTS "${TESTDATA_SOURCE_DIR}/README.md")
-    MESSAGE(WARNING "Test Data is not available. Tests will build but fail when run.")
-    MESSAGE("Did you miss the --recursive option when cloning?")
-    MESSAGE("Run the following commands to correct this:")
-    MESSAGE("git submodule init")
-    MESSAGE("git submodule update")
-    MESSAGE("git submodule foreach git pull origin master")
+    MESSAGE(STATUS "Test submodules unavailable. Updating submodules.")
+    EXECUTE_PROCESS(
+        COMMAND git submodule update --init --recursive
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        OUTPUT_QUIET
+    )
 ENDIF()
 ENDIF(NOT ${USE_RELATIVE_TEST_DIR})
 
+OPTION(USE_SYSTEM_GTEST "Use GTEST from system libraries" OFF)
+IF(USE_SYSTEM_GTEST)
+    FIND_PACKAGE(GTest REQUIRED)
+ELSE(USE_SYSTEM_GTEST)
+    INCLUDE("${CMAKE_MODULE_PATH}/build_gtest.cmake")
+ENDIF(USE_SYSTEM_GTEST)
+
+INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
+
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 FILE(GLOB FILES "*.cpp" "*.c")
 
 IF(${BUILD_CPU})
     MESSAGE(STATUS "TESTS: CPU backend is ON")
-    CREATE_TESTS(cpu "${GTEST_LIBRARIES}" "")
+    CREATE_TESTS(cpu cpu "${GTEST_LIBRARIES}" "")
 ELSE()
     MESSAGE(STATUS "TESTS: CPU backend is OFF")
 ENDIF()
@@ -94,7 +96,7 @@ ENDIF()
 IF(${BUILD_CUDA} AND ${CUDA_FOUND})
     MESSAGE(STATUS "TESTS: CUDA backend is ON")
     IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        CREATE_TESTS(cuda "${GTEST_LIBRARIES_STDLIB}" "")
+        CREATE_TESTS(cuda cuda "${GTEST_LIBRARIES_STDLIB}" "")
         FOREACH(FILE ${FILES})
             GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE)
             SET(TEST_NAME ${FNAME}_cuda)
@@ -102,7 +104,7 @@ IF(${BUILD_CUDA} AND ${CUDA_FOUND})
             SET_TARGET_PROPERTIES(${TEST_NAME} PROPERTIES LINK_FLAGS -stdlib=libstdc++)
         ENDFOREACH()
     ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        CREATE_TESTS(cuda "${GTEST_LIBRARIES}" "")
+        CREATE_TESTS(cuda cuda "${GTEST_LIBRARIES}" "")
     ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7)
 ELSE()
     MESSAGE(STATUS "TESTS: CUDA backend is OFF")
@@ -110,7 +112,16 @@ ENDIF()
 
 IF(${BUILD_OPENCL} AND ${OpenCL_FOUND})
     MESSAGE(STATUS "TESTS: OPENCL backend is ON")
-    CREATE_TESTS(opencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
+    CREATE_TESTS(opencl opencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}")
 ELSE()
     MESSAGE(STATUS "TESTS: OPENCL backend is OFF")
 ENDIF()
+
+IF(${BUILD_UNIFIED})
+    MESSAGE(STATUS "TESTS: Unified backends is ON")
+    IF(WIN32)
+        CREATE_TESTS(unified "" "${GTEST_LIBRARIES}" "")
+    ELSE()
+        CREATE_TESTS(unified "" "${GTEST_LIBRARIES}" dl)
+    ENDIF()
+ENDIF()
diff --git a/test/approx1.cpp b/test/approx1.cpp
index ad6eb3a5a1..7a6b66fce8 100644
--- a/test/approx1.cpp
+++ b/test/approx1.cpp
@@ -233,3 +233,49 @@ TEST(Approx1, CPP)
 
 #undef BT
 }
+
+TEST(Approx1, CPPNearestBatch)
+{
+    if (noDoubleTests<float>()) return;
+
+    af::array input = af::randu(600, 10);
+    af::array pos   = input.dims(0) * af::randu(100, 10);
+
+    af::array outBatch = af::approx1(input, pos, AF_INTERP_NEAREST);
+
+    af::array outSerial(pos.dims());
+    for(int i = 0; i < pos.dims(1); i++) {
+        outSerial(af::span, i) = af::approx1(input(af::span, i), pos(af::span, i), AF_INTERP_NEAREST);
+    }
+
+    af::array outGFOR(pos.dims());
+    gfor(af::seq i, pos.dims(1)) {
+        outGFOR(af::span, i) = af::approx1(input(af::span, i), pos(af::span, i), AF_INTERP_NEAREST);
+    }
+
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outSerial)), 1e-3);
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outGFOR)), 1e-3);
+}
+
+TEST(Approx1, CPPLinearBatch)
+{
+    if (noDoubleTests<float>()) return;
+
+    af::array input = af::iota(af::dim4(10000, 20), c32);
+    af::array pos   = input.dims(0) * af::randu(50000, 20);
+
+    af::array outBatch = af::approx1(input, pos, AF_INTERP_LINEAR);
+
+    af::array outSerial(pos.dims());
+    for(int i = 0; i < pos.dims(1); i++) {
+        outSerial(af::span, i) = af::approx1(input(af::span, i), pos(af::span, i), AF_INTERP_LINEAR);
+    }
+
+    af::array outGFOR(pos.dims());
+    gfor(af::seq i, pos.dims(1)) {
+        outGFOR(af::span, i) = af::approx1(input(af::span, i), pos(af::span, i), AF_INTERP_LINEAR);
+    }
+
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outSerial)), 1e-3);
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outGFOR)), 1e-3);
+}
diff --git a/test/approx2.cpp b/test/approx2.cpp
index 9c748e2c61..f1a1accc51 100644
--- a/test/approx2.cpp
+++ b/test/approx2.cpp
@@ -248,3 +248,55 @@ TEST(Approx2, CPP)
 
 #undef BT
 }
+
+TEST(Approx2, CPPNearestBatch)
+{
+    if (noDoubleTests<float>()) return;
+
+    af::array input = af::randu(200, 100, 10);
+    af::array pos   = input.dims(0) * af::randu(100, 100, 10);
+    af::array qos   = input.dims(1) * af::randu(100, 100, 10);
+
+    af::array outBatch = af::approx2(input, pos, qos, AF_INTERP_NEAREST);
+
+    af::array outSerial(pos.dims());
+    for(int i = 0; i < pos.dims(2); i++) {
+        outSerial(af::span, af::span, i) = af::approx2(input(af::span, af::span, i),
+            pos(af::span, af::span, i), qos(af::span, af::span, i), AF_INTERP_NEAREST);
+    }
+
+    af::array outGFOR(pos.dims());
+    gfor(af::seq i, pos.dims(2)) {
+        outGFOR(af::span, af::span, i) = af::approx2(input(af::span, af::span, i),
+            pos(af::span, af::span, i), qos(af::span, af::span, i), AF_INTERP_NEAREST);
+    }
+
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outSerial)), 1e-3);
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outGFOR)), 1e-3);
+}
+
+TEST(Approx2, CPPLinearBatch)
+{
+    if (noDoubleTests<float>()) return;
+
+    af::array input = af::randu(200, 100, 10);
+    af::array pos   = input.dims(0) * af::randu(100, 100, 10);
+    af::array qos   = input.dims(1) * af::randu(100, 100, 10);
+
+    af::array outBatch = af::approx2(input, pos, qos, AF_INTERP_LINEAR);
+
+    af::array outSerial(pos.dims());
+    for(int i = 0; i < pos.dims(2); i++) {
+        outSerial(af::span, af::span, i) = af::approx2(input(af::span, af::span, i),
+            pos(af::span, af::span, i), qos(af::span, af::span, i), AF_INTERP_LINEAR);
+    }
+
+    af::array outGFOR(pos.dims());
+    gfor(af::seq i, pos.dims(2)) {
+        outGFOR(af::span, af::span, i) = af::approx2(input(af::span, af::span, i),
+            pos(af::span, af::span, i), qos(af::span, af::span, i), AF_INTERP_LINEAR);
+    }
+
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outSerial)), 1e-3);
+    ASSERT_NEAR(0, af::sum<double>(af::abs(outBatch - outGFOR)), 1e-3);
+}
diff --git a/test/array.cpp b/test/array.cpp
index 682bc5b343..e3cb6220cb 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -20,7 +20,7 @@ class Array : public ::testing::Test
 
 };
 
-typedef ::testing::Types<float, double, af::cfloat, af::cdouble, char, unsigned char, int, uint, intl, uintl> TestTypes;
+typedef ::testing::Types<float, double, af::cfloat, af::cdouble, char, unsigned char, int, uint, intl, uintl, short, ushort> TestTypes;
 TYPED_TEST_CASE(Array, TestTypes);
 
 TEST(Array, ConstructorDefault)
@@ -283,6 +283,26 @@ TYPED_TEST(Array, TypeAttributes)
             EXPECT_FALSE(one.iscomplex());
             EXPECT_FALSE(one.isbool());
             break;
+        case s16:
+            EXPECT_FALSE(one.isfloating());
+            EXPECT_FALSE(one.isdouble());
+            EXPECT_FALSE(one.issingle());
+            EXPECT_FALSE(one.isrealfloating());
+            EXPECT_TRUE(one.isinteger());
+            EXPECT_TRUE(one.isreal());
+            EXPECT_FALSE(one.iscomplex());
+            EXPECT_FALSE(one.isbool());
+            break;
+        case u16:
+            EXPECT_FALSE(one.isfloating());
+            EXPECT_FALSE(one.isdouble());
+            EXPECT_FALSE(one.issingle());
+            EXPECT_FALSE(one.isrealfloating());
+            EXPECT_TRUE(one.isinteger());
+            EXPECT_TRUE(one.isreal());
+            EXPECT_FALSE(one.iscomplex());
+            EXPECT_FALSE(one.isbool());
+            break;
         case u8:
             EXPECT_FALSE(one.isfloating());
             EXPECT_FALSE(one.isdouble());
diff --git a/test/assign.cpp b/test/assign.cpp
index 56923923c1..af68acdfd1 100644
--- a/test/assign.cpp
+++ b/test/assign.cpp
@@ -79,7 +79,7 @@ class ArrayAssign : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, af::cdouble, af::cfloat, double, int, uint, char, uchar, intl, uintl> TestTypes;
+typedef ::testing::Types<float, af::cdouble, af::cfloat, double, int, uint, char, uchar, intl, uintl, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(ArrayAssign, TestTypes);
diff --git a/test/bilateral.cpp b/test/bilateral.cpp
index c80d376b52..08b7a4c2b4 100644
--- a/test/bilateral.cpp
+++ b/test/bilateral.cpp
@@ -80,7 +80,7 @@ class BilateralOnData : public ::testing::Test
 {
 };
 
-typedef ::testing::Types<float, double, int, uint, char, uchar> DataTestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> DataTestTypes;
 
 // register the type list
 TYPED_TEST_CASE(BilateralOnData, DataTestTypes);
diff --git a/test/constant.cpp b/test/constant.cpp
index 8f6558261d..d3244a0566 100644
--- a/test/constant.cpp
+++ b/test/constant.cpp
@@ -19,7 +19,7 @@ using std::vector;
 template<typename T>
 class Constant : public ::testing::Test { };
 
-typedef ::testing::Types<float, af::cfloat, double, af::cdouble, int, unsigned, char, uchar, uintl, intl> TestTypes;
+typedef ::testing::Types<float, af::cfloat, double, af::cdouble, int, unsigned, char, uchar, uintl, intl, short, ushort> TestTypes;
 TYPED_TEST_CASE(Constant, TestTypes);
 
 template<typename T>
diff --git a/test/convolve.cpp b/test/convolve.cpp
index 185eba993e..f3ff9fd6ef 100644
--- a/test/convolve.cpp
+++ b/test/convolve.cpp
@@ -28,7 +28,7 @@ class Convolve : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<cdouble, cfloat, float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<cdouble, cfloat, float, double, int, uint, char, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Convolve, TestTypes);
diff --git a/test/corrcoef.cpp b/test/corrcoef.cpp
new file mode 100644
index 0000000000..62454d44da
--- /dev/null
+++ b/test/corrcoef.cpp
@@ -0,0 +1,94 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <string>
+#include <vector>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
+#include <testHelpers.hpp>
+
+using namespace af;
+
+template<typename T>
+class CorrelationCoefficient : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+// create a list of types to be tested
+typedef ::testing::Types<float, double, int, uint, intl, uintl, char, uchar> TestTypes;
+
+// register the type list
+TYPED_TEST_CASE(CorrelationCoefficient, TestTypes);
+
+template<typename T>
+struct f32HelperType {
+   typedef typename cond_type<is_same_type<T, double>::value,
+                                             double,
+                                             float>::type type;
+};
+
+template<typename T>
+struct c32HelperType {
+   typedef typename cond_type<is_same_type<T, cfloat>::value,
+                                             cfloat,
+                                             typename f32HelperType<T>::type >::type type;
+};
+
+template<typename T>
+struct elseType {
+   typedef typename cond_type< is_same_type<T, uintl>::value ||
+                               is_same_type<T, intl> ::value,
+                                              double,
+                                              T>::type type;
+};
+
+template<typename T>
+struct ccOutType {
+   typedef typename cond_type< is_same_type<T, float>   ::value ||
+                               is_same_type<T, int>     ::value ||
+                               is_same_type<T, uint>    ::value ||
+                               is_same_type<T, uchar>   ::value ||
+                               is_same_type<T, short>   ::value ||
+                               is_same_type<T, ushort>  ::value ||
+                               is_same_type<T, char>    ::value,
+                                              float,
+                              typename elseType<T>::type>::type type;
+};
+
+TYPED_TEST(CorrelationCoefficient, All)
+{
+    typedef typename ccOutType<TypeParam>::type outType;
+    if (noDoubleTests<TypeParam>()) return;
+    if (noDoubleTests<outType>()) return;
+
+    std::vector<af::dim4>      numDims;
+    std::vector<std::vector<int> >       in;
+    std::vector<std::vector<float> >  tests;
+
+    readTestsFromFile<int,float>(std::string(TEST_DIR "/corrcoef/mat_10x10_scalar.test"),
+                                 numDims, in, tests);
+
+    std::vector<TypeParam> input1(in[0].begin(), in[0].end());
+    std::vector<TypeParam> input2(in[1].begin(), in[1].end());
+
+    array a(numDims[0], &(input1.front()));
+    array b(numDims[1], &(input2.front()));
+    outType c = corrcoef<outType>(a, b);
+
+    std::vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+    ASSERT_NEAR(::real(currGoldBar[0]), ::real(c), 1.0e-3);
+    ASSERT_NEAR(::imag(currGoldBar[0]), ::imag(c), 1.0e-3);
+}
diff --git a/test/covariance.cpp b/test/covariance.cpp
new file mode 100644
index 0000000000..933f617612
--- /dev/null
+++ b/test/covariance.cpp
@@ -0,0 +1,135 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <string>
+#include <vector>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
+#include <testHelpers.hpp>
+
+using std::string;
+using std::vector;
+using namespace af;
+
+template<typename T>
+class Covariance : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+// create a list of types to be tested
+typedef ::testing::Types<float, double, int, uint, intl, uintl, uchar, short, ushort> TestTypes;
+
+// register the type list
+TYPED_TEST_CASE(Covariance, TestTypes);
+
+template<typename T>
+struct f32HelperType {
+   typedef typename cond_type<is_same_type<T, double>::value,
+                                             double,
+                                             float>::type type;
+};
+
+template<typename T>
+struct c32HelperType {
+   typedef typename cond_type<is_same_type<T, cfloat>::value,
+                                             cfloat,
+                                             typename f32HelperType<T>::type >::type type;
+};
+
+template<typename T>
+struct elseType {
+   typedef typename cond_type< is_same_type<T, uintl>::value ||
+                               is_same_type<T, intl> ::value,
+                                              double,
+                                              T>::type type;
+};
+
+template<typename T>
+struct covOutType {
+   typedef typename cond_type< is_same_type<T, float>   ::value ||
+                               is_same_type<T, int>     ::value ||
+                               is_same_type<T, uint>    ::value ||
+                               is_same_type<T, uchar>   ::value ||
+                               is_same_type<T, short>   ::value ||
+                               is_same_type<T, ushort>  ::value ||
+                               is_same_type<T, char>    ::value,
+                                              float,
+                              typename elseType<T>::type>::type type;
+};
+
+template<typename T>
+void covTest(string pFileName, bool isbiased=false)
+{
+    typedef typename covOutType<T>::type outType;
+    if (noDoubleTests<T>()) return;
+    if (noDoubleTests<outType>()) return;
+
+    vector<af::dim4>      numDims;
+    vector<vector<int> >        in;
+    vector<vector<float> >   tests;
+
+    readTestsFromFile<int,float>(pFileName, numDims, in, tests);
+
+    af::dim4 dims1 = numDims[0];
+    af::dim4 dims2 = numDims[1];
+    vector<T> input1(in[0].begin(), in[0].end());
+    vector<T> input2(in[1].begin(), in[1].end());
+
+    array a(dims1, &(input1.front()));
+    array b(dims2, &(input2.front()));
+
+    array c = cov(a, b, isbiased);
+
+    vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+
+    size_t nElems    = currGoldBar.size();
+    outType *outData = new outType[nElems];
+
+    c.host((void*)outData);
+
+    for (size_t elIter=0; elIter<nElems; ++elIter) {
+        ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+        ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+    }
+
+    // cleanup
+    delete[] outData;
+}
+
+TYPED_TEST(Covariance, Vector)
+{
+    covTest<TypeParam>(string(TEST_DIR "/covariance/vec_size60.test"), false);
+}
+
+TYPED_TEST(Covariance, Matrix)
+{
+    covTest<TypeParam>(string(TEST_DIR "/covariance/matrix_65x121.test"), false);
+}
+
+TEST(Covariance, c32)
+{
+    array a = constant(cfloat(1.0f, -1.0f), 10, c32);
+    array b = constant(cfloat(2.0f, -1.0f), 10, c32);
+    ASSERT_THROW(cov(a, b), af::exception);
+}
+
+TEST(Covariance, c64)
+{
+    if (noDoubleTests<double>()) return;
+    array a = constant(cdouble(1.0, -1.0), 10, c64);
+    array b = constant(cdouble(2.0, -1.0), 10, c64);
+    ASSERT_THROW(cov(a, b), af::exception);
+}
diff --git a/test/data b/test/data
index 0c50b64fb9..db4f6e8062 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit 0c50b64fb963cef89b26a0664db7260fb92e19c9
+Subproject commit db4f6e80629fb41580ab93208db6b8be958871df
diff --git a/test/diff1.cpp b/test/diff1.cpp
index 7fe19db859..94596816b0 100644
--- a/test/diff1.cpp
+++ b/test/diff1.cpp
@@ -46,7 +46,7 @@ class Diff1 : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Diff1, TestTypes);
diff --git a/test/diff2.cpp b/test/diff2.cpp
index 9f7d0cb0a3..3649f7a798 100644
--- a/test/diff2.cpp
+++ b/test/diff2.cpp
@@ -46,7 +46,7 @@ class Diff2 : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Diff2, TestTypes);
diff --git a/test/dog.cpp b/test/dog.cpp
index 284a8ad47e..f981bba1a8 100644
--- a/test/dog.cpp
+++ b/test/dog.cpp
@@ -24,7 +24,7 @@ class DOG : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(DOG, TestTypes);
@@ -35,7 +35,7 @@ TYPED_TEST(DOG, Basic)
     if (noDoubleTests<TypeParam>()) return;
 
     af::dim4 iDims(512, 512, 1, 1);
-    af::array in = af::constant<TypeParam>(1, iDims);
+    af::array in = af::constant(1, iDims, (af_dtype)af::dtype_traits<float>::af_type);
     /* calculate DOG using ArrayFire functions */
     af::array k1    = af::gaussianKernel(3, 3);
     af::array k2    = af::gaussianKernel(2, 2);
@@ -54,7 +54,7 @@ TYPED_TEST(DOG, Batch)
     if (noDoubleTests<TypeParam>()) return;
 
     af::dim4 iDims(512, 512, 3, 1);
-    af::array in = af::constant<TypeParam>(1, iDims);
+    af::array in = af::constant(1, iDims, (af_dtype)af::dtype_traits<float>::af_type);
     /* calculate DOG using ArrayFire functions */
     af::array k1    = af::gaussianKernel(3, 3);
     af::array k2    = af::gaussianKernel(2, 2);
diff --git a/test/fast.cpp b/test/fast.cpp
index 2c24f8a961..c13d6da008 100644
--- a/test/fast.cpp
+++ b/test/fast.cpp
@@ -63,7 +63,7 @@ class FixedFAST : public ::testing::Test
 };
 
 typedef ::testing::Types<float, double> FloatTestTypes;
-typedef ::testing::Types<int, unsigned> FixedTestTypes;
+typedef ::testing::Types<int, unsigned, short, ushort> FixedTestTypes;
 
 TYPED_TEST_CASE(FloatFAST, FloatTestTypes);
 TYPED_TEST_CASE(FixedFAST, FixedTestTypes);
diff --git a/test/fftconvolve.cpp b/test/fftconvolve.cpp
index eb0e618deb..cd82ab20d9 100644
--- a/test/fftconvolve.cpp
+++ b/test/fftconvolve.cpp
@@ -35,7 +35,7 @@ class FFTConvolveLarge : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<cfloat, cdouble, float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<cfloat, cdouble, float, double, int, uint, char, uchar, intl, uintl> TestTypes;
 typedef ::testing::Types<float, double> TestTypesLarge;
 
 // register the type list
diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp
new file mode 100644
index 0000000000..2346269734
--- /dev/null
+++ b/test/gloh_nonfree.cpp
@@ -0,0 +1,336 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <af/compatible.h>
+#include <string>
+#include <vector>
+#include <cmath>
+#include <testHelpers.hpp>
+#include <typeinfo>
+
+using std::string;
+using std::vector;
+using af::dim4;
+
+typedef struct
+{
+    float f[5];
+    unsigned d[272];
+} feat_desc_t;
+
+typedef struct
+{
+    float f[5];
+} feat_t;
+
+typedef struct
+{
+    float d[272];
+} desc_t;
+
+bool feat_cmp(feat_desc_t i, feat_desc_t j)
+{
+    for (int k = 0; k < 5; k++)
+        if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f))
+            return (round(i.f[k]*1e1f) < round(j.f[k]*1e1f));
+
+    return true;
+}
+
+void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
+{
+    feat.resize(nfeat);
+    for (size_t i = 0; i < feat.size(); i++) {
+        feat[i].f[0] = x[i];
+        feat[i].f[1] = y[i];
+        feat[i].f[2] = score[i];
+        feat[i].f[3] = ori[i];
+        feat[i].f[4] = size[i];
+        for (unsigned j = 0; j < 272; j++)
+            feat[i].d[j] = desc[i * 272 + j];
+    }
+}
+
+void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
+{
+    feat.resize(nfeat);
+    for (size_t i = 0; i < feat.size(); i++) {
+        feat[i].f[0] = x[i];
+        feat[i].f[1] = y[i];
+        feat[i].f[2] = score[i];
+        feat[i].f[3] = ori[i];
+        feat[i].f[4] = size[i];
+        for (unsigned j = 0; j < 272; j++)
+            feat[i].d[j] = desc[i][j];
+    }
+}
+
+void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
+{
+    feat.resize(nfeat);
+    for (unsigned i = 0; i < feat.size(); i++) {
+        feat[i].f[0] = x[i];
+        feat[i].f[1] = y[i];
+        feat[i].f[2] = score[i];
+        feat[i].f[3] = ori[i];
+        feat[i].f[4] = size[i];
+    }
+}
+
+void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
+{
+    f.resize(fd.size());
+    d.resize(fd.size());
+    for (size_t i = 0; i < fd.size(); i++) {
+        f[i].f[0] = fd[i].f[0];
+        f[i].f[1] = fd[i].f[1];
+        f[i].f[2] = fd[i].f[2];
+        f[i].f[3] = fd[i].f[3];
+        f[i].f[4] = fd[i].f[4];
+        for (unsigned j = 0; j < 272; j++)
+            d[i].d[j] = fd[i].d[j];
+    }
+}
+
+unsigned popcount(unsigned x)
+{
+    x = x - ((x >> 1) & 0x55555555);
+    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+    x = (x + (x >> 4)) & 0x0F0F0F0F;
+    x = x + (x >> 8);
+    x = x + (x >> 16);
+    return x & 0x0000003F;
+}
+
+bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
+{
+    bool ret = true;
+    float sum = 0.0f;
+
+    for (dim_t i = 0; i < ndesc; i++) {
+        sum = 0.0f;
+        for (dim_t l = 0; l < desc_len; l++) {
+            dim_t idx = i * desc_len + l;
+            float x = (cpu[idx] - gpu[idx]);
+            sum += x*x;
+            if (abs(x) > (float)unit_thr) {
+                ret = false;
+                std::cout<<std::endl<<"@compareEuclidean: unit mismatch."<<std::endl;
+                std::cout<<"(cpu,gpu,cpu-gpu)["<<i<<","<<l<<"] : {"<<cpu[idx]<<","<<gpu[idx]<<","<<cpu[idx]-gpu[idx]<<"}"<<std::endl;
+                std::cout<<std::endl;
+                break;
+            }
+        }
+        if (sqrt(sum) > euc_thr) {
+            ret = false;
+            std::cout<<std::endl<<"@compareEuclidean: distance mismatch."<<std::endl;
+            std::cout<<"Euclidean distance: "<<sqrt(sum)<<std::endl;
+        }
+        if (ret == false)
+            return ret;
+    }
+
+    return ret;
+}
+
+template<typename T>
+class GLOH : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+typedef ::testing::Types<float, double> TestTypes;
+
+TYPED_TEST_CASE(GLOH, TestTypes);
+
+template<typename T>
+void glohTest(string pTestFile)
+{
+#ifdef AF_BUILD_SIFT
+    if (noDoubleTests<T>()) return;
+
+    vector<dim4>           inDims;
+    vector<string>         inFiles;
+    vector<vector<float> > goldFeat;
+    vector<vector<float> > goldDesc;
+
+    readImageFeaturesDescriptors<float>(pTestFile, inDims, inFiles, goldFeat, goldDesc);
+
+    size_t testCount = inDims.size();
+
+    for (size_t testId=0; testId<testCount; ++testId) {
+        af_array inArray_f32  = 0;
+        af_array inArray      = 0;
+        af_array desc         = 0;
+        af_features feat;
+
+        inFiles[testId].insert(0,string(TEST_DIR"/gloh/"));
+
+        ASSERT_EQ(AF_SUCCESS, af_load_image(&inArray_f32, inFiles[testId].c_str(), false));
+        ASSERT_EQ(AF_SUCCESS, conv_image<T>(&inArray, inArray_f32));
+
+        ASSERT_EQ(AF_SUCCESS, af_gloh(&feat, &desc, inArray, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f));
+
+        dim_t n = 0;
+        af_array x, y, score, orientation, size;
+
+        ASSERT_EQ(AF_SUCCESS, af_get_features_num(&n, feat));
+        ASSERT_EQ(AF_SUCCESS, af_get_features_xpos(&x, feat));
+        ASSERT_EQ(AF_SUCCESS, af_get_features_ypos(&y, feat));
+        ASSERT_EQ(AF_SUCCESS, af_get_features_score(&score, feat));
+        ASSERT_EQ(AF_SUCCESS, af_get_features_orientation(&orientation, feat));
+        ASSERT_EQ(AF_SUCCESS, af_get_features_size(&size, feat));
+
+        float * outX           = new float[n];
+        float * outY           = new float[n];
+        float * outScore       = new float[n];
+        float * outOrientation = new float[n];
+        float * outSize        = new float[n];
+        dim_t descSize;
+        dim_t descDims[4];
+        ASSERT_EQ(AF_SUCCESS, af_get_elements(&descSize, desc));
+        ASSERT_EQ(AF_SUCCESS, af_get_dims(&descDims[0], &descDims[1], &descDims[2], &descDims[3], desc));
+        float * outDesc     = new float[descSize];
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outX, x));
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outY, y));
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outScore, score));
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outOrientation, orientation));
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outSize, size));
+        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outDesc, desc));
+
+        vector<feat_desc_t> out_feat_desc;
+        array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, n);
+
+        vector<feat_desc_t> gold_feat_desc;
+        array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
+
+        std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
+        std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
+
+        vector<feat_t> out_feat;
+        vector<desc_t> v_out_desc;
+        vector<feat_t> gold_feat;
+        vector<desc_t> v_gold_desc;
+
+        split_feat_desc(out_feat_desc, out_feat, v_out_desc);
+        split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
+
+        for (int elIter = 0; elIter < (int)n; elIter++) {
+            ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
+            ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
+            ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
+            ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
+            ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
+        }
+
+        EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 2.f, 5.5f));
+
+        ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32));
+
+        ASSERT_EQ(AF_SUCCESS, af_release_array(x));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(y));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(score));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(orientation));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(size));
+        ASSERT_EQ(AF_SUCCESS, af_release_array(desc));
+
+        delete[] outX;
+        delete[] outY;
+        delete[] outScore;
+        delete[] outOrientation;
+        delete[] outSize;
+        delete[] outDesc;
+    }
+#endif
+}
+
+#define GLOH_INIT(desc, image) \
+    TYPED_TEST(GLOH, desc) \
+    {   \
+        glohTest<TypeParam>(string(TEST_DIR"/gloh/"#image".test"));   \
+    }
+
+    GLOH_INIT(man, man);
+
+///////////////////////////////////// CPP ////////////////////////////////
+//
+TEST(GLOH, CPP)
+{
+#ifdef AF_BUILD_SIFT
+    if (noDoubleTests<float>()) return;
+
+    vector<dim4>           inDims;
+    vector<string>         inFiles;
+    vector<vector<float> > goldFeat;
+    vector<vector<float> > goldDesc;
+
+    readImageFeaturesDescriptors<float>(string(TEST_DIR"/gloh/man.test"), inDims, inFiles, goldFeat, goldDesc);
+    inFiles[0].insert(0,string(TEST_DIR"/gloh/"));
+
+    af::array in = af::loadImage(inFiles[0].c_str(), false);
+
+    af::features feat;
+    af::array desc;
+    af::gloh(feat, desc, in, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f);
+
+    float * outX           = new float[feat.getNumFeatures()];
+    float * outY           = new float[feat.getNumFeatures()];
+    float * outScore       = new float[feat.getNumFeatures()];
+    float * outOrientation = new float[feat.getNumFeatures()];
+    float * outSize        = new float[feat.getNumFeatures()];
+    float * outDesc        = new float[desc.elements()];
+    af::dim4 descDims = desc.dims();
+    feat.getX().host(outX);
+    feat.getY().host(outY);
+    feat.getScore().host(outScore);
+    feat.getOrientation().host(outOrientation);
+    feat.getSize().host(outSize);
+    desc.host(outDesc);
+
+    vector<feat_desc_t> out_feat_desc;
+    array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, feat.getNumFeatures());
+
+    vector<feat_desc_t> gold_feat_desc;
+    array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
+
+    std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
+    std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
+
+    vector<feat_t> out_feat;
+    vector<desc_t> v_out_desc;
+    vector<feat_t> gold_feat;
+    vector<desc_t> v_gold_desc;
+
+    split_feat_desc(out_feat_desc, out_feat, v_out_desc);
+    split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
+
+    for (int elIter = 0; elIter < (int)feat.getNumFeatures(); elIter++) {
+        ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
+        ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
+        ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
+        ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
+        ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
+    }
+
+    EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 2.f, 5.5f));
+
+    delete[] outX;
+    delete[] outY;
+    delete[] outScore;
+    delete[] outOrientation;
+    delete[] outSize;
+    delete[] outDesc;
+#endif
+}
diff --git a/test/hamming.cpp b/test/hamming.cpp
index 042ff30fd6..5b359b74d7 100644
--- a/test/hamming.cpp
+++ b/test/hamming.cpp
@@ -35,8 +35,8 @@ class HammingMatcher32 : public ::testing::Test
 };
 
 // create lists of types to be tested
-typedef ::testing::Types<uchar> TestTypes8;
-typedef ::testing::Types<uint> TestTypes32;
+typedef ::testing::Types<uchar, ushort> TestTypes8;
+typedef ::testing::Types<uint, uintl> TestTypes32;
 
 // register the type list
 TYPED_TEST_CASE(HammingMatcher8,  TestTypes8);
diff --git a/test/histogram.cpp b/test/histogram.cpp
index a9a820f4de..f1d7af51b9 100644
--- a/test/histogram.cpp
+++ b/test/histogram.cpp
@@ -27,7 +27,7 @@ class Histogram : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Histogram, TestTypes);
diff --git a/test/homography.cpp b/test/homography.cpp
new file mode 100644
index 0000000000..7be9e07473
--- /dev/null
+++ b/test/homography.cpp
@@ -0,0 +1,277 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <af/compatible.h>
+#include <string>
+#include <vector>
+#include <cmath>
+#include <testHelpers.hpp>
+#include <typeinfo>
+
+using std::string;
+using std::vector;
+using af::dim4;
+
+template<typename T>
+class Homography : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+typedef ::testing::Types<float, double> TestTypes;
+
+TYPED_TEST_CASE(Homography, TestTypes);
+
+template<typename T>
+af::array perspectiveTransform(af::dim4 inDims, af::array H)
+{
+    T d0 = (T)inDims[0];
+    T d1 = (T)inDims[1];
+    af::dim4 dims(4, 3);
+    T h_in[4*3] = { (T)0, (T)0,  (T)d1, (T)d1,
+                    (T)0, (T)d0, (T)d0, (T)0,
+                    (T)1, (T)1,  (T)1,  (T)1 };
+
+    af::array in(dims, h_in);
+
+    af::array w = 1.f / af::matmul(in, H(af::span, 2));
+    af::array xt = af::matmul(in, H(af::span, 0)) * w;
+    af::array yt = af::matmul(in, H(af::span, 1)) * w;
+
+    af::array t = join(1, xt, yt);
+
+    return t;
+}
+
+template<typename T>
+void homographyTest(string pTestFile, const af_homography_type htype,
+                    const bool rotate, const float size_ratio)
+{
+    if (noDoubleTests<T>()) return;
+
+    vector<dim4>           inDims;
+    vector<string>         inFiles;
+    vector<vector<float> > gold;
+
+    readImageTests(pTestFile, inDims, inFiles, gold);
+
+    inFiles[0].insert(0,string(TEST_DIR"/homography/"));
+
+    af_array trainArray_f32   = 0;
+    af_array trainArray       = 0;
+    af_array train_desc       = 0;
+    af_array train_feat_x     = 0;
+    af_array train_feat_y     = 0;
+    af_features train_feat;
+
+    ASSERT_EQ(AF_SUCCESS, af_load_image(&trainArray_f32, inFiles[0].c_str(), false));
+    ASSERT_EQ(AF_SUCCESS, conv_image<T>(&trainArray, trainArray_f32));
+
+    ASSERT_EQ(AF_SUCCESS, af_orb(&train_feat, &train_desc, trainArray, 20.0f, 2000, 1.2f, 8, true));
+
+    ASSERT_EQ(AF_SUCCESS, af_get_features_xpos(&train_feat_x, train_feat));
+    ASSERT_EQ(AF_SUCCESS, af_get_features_ypos(&train_feat_y, train_feat));
+
+    af_array queryArray       = 0;
+    af_array query_desc       = 0;
+    af_array idx              = 0;
+    af_array dist             = 0;
+    af_array const_50         = 0;
+    af_array dist_thr         = 0;
+    af_array train_idx        = 0;
+    af_array query_idx        = 0;
+    af_array query_feat_x     = 0;
+    af_array query_feat_y     = 0;
+    af_array H                = 0;
+    af_array train_feat_x_idx = 0;
+    af_array train_feat_y_idx = 0;
+    af_array query_feat_x_idx = 0;
+    af_array query_feat_y_idx = 0;
+    af_features query_feat;
+
+    const float theta = af::Pi * 0.5f;
+    const dim_t test_d0 = inDims[0][0] * size_ratio;
+    const dim_t test_d1 = inDims[0][1] * size_ratio;
+    if (rotate)
+        ASSERT_EQ(AF_SUCCESS, af_rotate(&queryArray, trainArray, theta, false, AF_INTERP_NEAREST));
+    else
+        ASSERT_EQ(AF_SUCCESS, af_resize(&queryArray, trainArray, test_d0, test_d1, AF_INTERP_BILINEAR));
+
+    ASSERT_EQ(AF_SUCCESS, af_orb(&query_feat, &query_desc, queryArray, 20.0f, 2000, 1.2f, 8, true));
+
+    ASSERT_EQ(AF_SUCCESS, af_hamming_matcher(&idx, &dist, train_desc, query_desc, 0, 1));
+
+    dim_t distDims[4];
+    ASSERT_EQ(AF_SUCCESS, af_get_dims(&distDims[0], &distDims[1], &distDims[2], &distDims[3], dist));
+
+    ASSERT_EQ(AF_SUCCESS, af_constant(&const_50, 50, 2, distDims, u32));
+    ASSERT_EQ(AF_SUCCESS, af_lt(&dist_thr, dist, const_50, false));
+    ASSERT_EQ(AF_SUCCESS, af_where(&train_idx, dist_thr));
+
+    dim_t tidxDims[4];
+    ASSERT_EQ(AF_SUCCESS, af_get_dims(&tidxDims[0], &tidxDims[1], &tidxDims[2], &tidxDims[3], train_idx));
+    af_index_t tindexs;
+    tindexs.isSeq = false;
+    tindexs.idx.seq = af_make_seq(0, tidxDims[0]-1, 1);
+    tindexs.idx.arr = train_idx;
+    ASSERT_EQ(AF_SUCCESS, af_index_gen(&query_idx, idx, 1, &tindexs));
+
+    ASSERT_EQ(AF_SUCCESS, af_get_features_xpos(&query_feat_x, query_feat));
+    ASSERT_EQ(AF_SUCCESS, af_get_features_ypos(&query_feat_y, query_feat));
+
+    dim_t qidxDims[4];
+    ASSERT_EQ(AF_SUCCESS, af_get_dims(&qidxDims[0], &qidxDims[1], &qidxDims[2], &qidxDims[3], query_idx));
+    af_index_t qindexs;
+    qindexs.isSeq = false;
+    qindexs.idx.seq = af_make_seq(0, qidxDims[0]-1, 1);
+    qindexs.idx.arr = query_idx;
+
+    ASSERT_EQ(AF_SUCCESS, af_index_gen(&train_feat_x_idx, train_feat_x, 1, &tindexs));
+    ASSERT_EQ(AF_SUCCESS, af_index_gen(&train_feat_y_idx, train_feat_y, 1, &tindexs));
+    ASSERT_EQ(AF_SUCCESS, af_index_gen(&query_feat_x_idx, query_feat_x, 1, &qindexs));
+    ASSERT_EQ(AF_SUCCESS, af_index_gen(&query_feat_y_idx, query_feat_y, 1, &qindexs));
+
+    int inliers = 0;
+    ASSERT_EQ(AF_SUCCESS, af_homography(&H, &inliers, train_feat_x_idx, train_feat_y_idx,
+                                        query_feat_x_idx, query_feat_y_idx, htype,
+                                        3.0f, 1000, (af_dtype) af::dtype_traits<T>::af_type));
+
+    af::array HH(H);
+
+    af::array t = perspectiveTransform<T>(inDims[0], HH);
+
+    T* gold_t = new T[8];
+    for (int i = 0; i < 8; i++)
+        gold_t[i] = (T)0;
+    if (rotate) {
+        gold_t[1] = test_d0;
+        gold_t[2] = test_d0;
+        gold_t[4] = test_d1;
+        gold_t[5] = test_d1;
+    } else {
+        gold_t[2] = test_d1;
+        gold_t[3] = test_d1;
+        gold_t[5] = test_d0;
+        gold_t[6] = test_d0;
+    }
+
+    T* out_t = new T[8];
+    t.host(out_t);
+
+    for (int elIter = 0; elIter < 8; elIter++)
+        ASSERT_LE(fabs(out_t[elIter] - gold_t[elIter]), 70.f) << "at: " << elIter << std::endl;
+
+    delete[] gold_t;
+    delete[] out_t;
+
+    ASSERT_EQ(AF_SUCCESS, af_release_array(queryArray));
+
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_desc));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(dist));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(const_50));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(dist_thr));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_feat_x));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_feat_y));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_feat_x_idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_feat_y_idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_feat_x_idx));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(query_feat_y_idx));
+
+    ASSERT_EQ(AF_SUCCESS, af_release_array(trainArray));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(trainArray_f32));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_desc));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_feat_x));
+    ASSERT_EQ(AF_SUCCESS, af_release_array(train_feat_y));
+}
+
+#define HOMOGRAPHY_INIT(desc, image, htype, rotate, size_ratio)                 \
+    TYPED_TEST(Homography, desc)                                                \
+    {                                                                           \
+        homographyTest<TypeParam>(string(TEST_DIR"/homography/"#image".test"),  \
+                                  htype, rotate, size_ratio);                   \
+    }
+
+    HOMOGRAPHY_INIT(Tux_RANSAC, tux, AF_HOMOGRAPHY_RANSAC, false, 1.0f);
+    HOMOGRAPHY_INIT(Tux_RANSAC_90degrees, tux, AF_HOMOGRAPHY_RANSAC, true, 1.0f);
+    HOMOGRAPHY_INIT(Tux_RANSAC_resize, tux, AF_HOMOGRAPHY_RANSAC, false, 1.5f);
+    //HOMOGRAPHY_INIT(Tux_LMedS, tux, AF_HOMOGRAPHY_LMEDS, false, 1.0f);
+    //HOMOGRAPHY_INIT(Tux_LMedS_90degrees, tux, AF_HOMOGRAPHY_LMEDS, true, 1.0f);
+    //HOMOGRAPHY_INIT(Tux_LMedS_resize, tux, AF_HOMOGRAPHY_LMEDS, false, 1.5f);
+
+///////////////////////////////////// CPP ////////////////////////////////
+//
+TEST(Homography, CPP)
+{
+    vector<dim4>           inDims;
+    vector<string>         inFiles;
+    vector<vector<float> > gold;
+
+    readImageTests(string(TEST_DIR"/homography/tux.test"), inDims, inFiles, gold);
+
+    inFiles[0].insert(0,string(TEST_DIR"/homography/"));
+
+    const float size_ratio = 0.5f;
+
+    af::array train_img = af::loadImage(inFiles[0].c_str(), false);
+    af::array query_img = af::resize(size_ratio, train_img);
+    af::dim4 tDims = train_img.dims();
+
+    af::features feat_train, feat_query;
+    af::array desc_train, desc_query;
+    orb(feat_train, desc_train, train_img, 20, 2000, 1.2, 8, true);
+    orb(feat_query, desc_query, query_img, 20, 2000, 1.2, 8, true);
+
+    af::array idx, dist;
+    af::hammingMatcher(idx, dist, desc_train, desc_query, 0, 1);
+
+    af::array train_idx = where(dist < 30);
+    af::array query_idx = idx(train_idx);
+
+    af::array feat_train_x = feat_train.getX()(train_idx);
+    af::array feat_train_y = feat_train.getY()(train_idx);
+    af::array feat_train_score = feat_train.getScore()(train_idx);
+    af::array feat_train_orientation = feat_train.getOrientation()(train_idx);
+    af::array feat_train_size = feat_train.getSize()(train_idx);
+    af::array feat_query_x = feat_query.getX()(query_idx);
+    af::array feat_query_y = feat_query.getY()(query_idx);
+    af::array feat_query_score = feat_query.getScore()(query_idx);
+    af::array feat_query_orientation = feat_query.getOrientation()(query_idx);
+    af::array feat_query_size = feat_query.getSize()(query_idx);
+
+    af::array H;
+    int inliers = 0;
+    af::homography(H, inliers, feat_train_x, feat_train_y, feat_query_x, feat_query_y, AF_HOMOGRAPHY_RANSAC, 3.0f, 1000, f32);
+
+    float* gold_t = new float[8];
+    for (int i = 0; i < 8; i++)
+        gold_t[i] = 0.f;
+    gold_t[2] = tDims[1] * size_ratio;
+    gold_t[3] = tDims[1] * size_ratio;
+    gold_t[5] = tDims[0] * size_ratio;
+    gold_t[6] = tDims[0] * size_ratio;
+
+    af::array t = perspectiveTransform<float>(train_img.dims(), H);
+
+    float* out_t = new float[4*2];
+    t.host(out_t);
+
+    for (int elIter = 0; elIter < 8; elIter++)
+        ASSERT_LE(fabs(out_t[elIter] - gold_t[elIter]), 70.f) << "at: " << elIter << std::endl;
+
+    delete[] gold_t;
+    delete[] out_t;
+}
diff --git a/test/imageio.cpp b/test/imageio.cpp
index 20d1b43e7c..a826bb8cf8 100644
--- a/test/imageio.cpp
+++ b/test/imageio.cpp
@@ -56,10 +56,18 @@ void loadImageTest(string pTestFile, string pImageFile, const bool isColor)
     float *imgData = new float[dims.elements()];
     ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*) imgData, imgArray));
 
+    bool isJPEG = false;
+    if(pImageFile.find(".jpg") != std::string::npos) {
+        isJPEG = true;
+    }
+
     // Compare result
     size_t nElems = in[0].size();
     for (size_t elIter = 0; elIter < nElems; ++elIter) {
-        ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl;
+        if(isJPEG)  // Allow +- 1 because of compression when testing JPG
+            ASSERT_NEAR(in[0][elIter], imgData[elIter], 1) << "at: " << elIter << std::endl;
+        else
+            ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl;
     }
 
     // Delete
diff --git a/test/index.cpp b/test/index.cpp
index 6a798aab08..d6d1a64709 100644
--- a/test/index.cpp
+++ b/test/index.cpp
@@ -126,7 +126,7 @@ class Indexing1D : public ::testing::Test
     vector<af_seq> span_seqs;
 };
 
-typedef ::testing::Types<float, double, af::cfloat, af::cdouble, int, unsigned, unsigned char, intl, uintl> AllTypes;
+typedef ::testing::Types<float, double, af::cfloat, af::cdouble, int, unsigned, unsigned char, intl, uintl, short, ushort> AllTypes;
 TYPED_TEST_CASE(Indexing1D, AllTypes);
 
 TYPED_TEST(Indexing1D, Continious)          { DimCheck<TypeParam>(this->continuous_seqs);           }
@@ -549,7 +549,7 @@ class lookup : public ::testing::Test
         }
 };
 
-typedef ::testing::Types<float, double, int, unsigned, unsigned char> ArrIdxTestTypes;
+typedef ::testing::Types<float, double, int, unsigned, unsigned char, short, ushort, intl, uintl> ArrIdxTestTypes;
 TYPED_TEST_CASE(lookup, ArrIdxTestTypes);
 
 template<typename T>
@@ -1369,3 +1369,20 @@ TEST(Asssign, LinearIndexGenArr)
         ASSERT_EQ(ha[i + st], hout[i]);
     }
 }
+
+TEST(Index, OutOfBounds)
+{
+    using af::array;
+
+    uint gold[7] = {0, 9, 49, 119, 149, 149, 148};
+    uint h_idx[7] = {0, 9, 49, 119, 149, 150, 151};
+    uint output[7];
+
+    array a = af::iota(af::dim4(50, 1, 3)).as(s32);
+    array idx(7, h_idx);
+    array b = a(idx);
+    b.host((void*)output);
+
+    for(int i=0; i<7; ++i)
+        ASSERT_EQ(gold[i], output[i]);
+}
diff --git a/test/iota.cpp b/test/iota.cpp
index 1c1ca6c116..e91741d199 100644
--- a/test/iota.cpp
+++ b/test/iota.cpp
@@ -38,7 +38,7 @@ class Iota : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, unsigned int, intl, uintl, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, int, unsigned int, intl, uintl, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Iota, TestTypes);
diff --git a/test/ireduce.cpp b/test/ireduce.cpp
index aa2b66df75..c0536be267 100644
--- a/test/ireduce.cpp
+++ b/test/ireduce.cpp
@@ -14,7 +14,6 @@
 #include <testHelpers.hpp>
 #include <algorithm>
 
-using namespace std;
 using namespace af;
 
 
@@ -27,14 +26,14 @@ using namespace af;
         const int ny = 100;                             \
         af::array in = randu(nx, ny, dty);              \
         af::array val, idx;                             \
-        fn(val, idx, in, 0);                            \
+        af::fn(val, idx, in, 0);                        \
                                                         \
         ty *h_in = in.host<ty>();                       \
         ty *h_in_st = h_in;                             \
         ty *h_val = val.host<ty>();                     \
         uint *h_idx = idx.host<uint>();                 \
         for (int i = 0; i < ny; i++) {                  \
-            ty tmp = *fn##_element(h_in, h_in + nx);    \
+            ty tmp = *std::fn##_element(h_in, h_in +nx);\
             ASSERT_EQ(tmp, h_val[i])                    \
                 << "for index" << i;                    \
             ASSERT_EQ(h_in[h_idx[i]], tmp)              \
@@ -53,7 +52,7 @@ using namespace af;
         const int ny = 100;                             \
         af::array in = randu(nx, ny, dty);              \
         af::array val, idx;                             \
-        fn(val, idx, in, 1);                            \
+        af::fn(val, idx, in, 1);                        \
                                                         \
         ty *h_in = in.host<ty>();                       \
         ty *h_val = val.host<ty>();                     \
@@ -61,7 +60,7 @@ using namespace af;
         for (int i = 0; i < nx; i++) {                  \
             ty val = h_val[i];                          \
             for (int j= 0; j < ny; j++) {               \
-                ty tmp = fn(val, h_in[j * nx + i]);     \
+                ty tmp = std::fn(val, h_in[j * nx + i]);\
                 ASSERT_EQ(tmp, val);                    \
             }                                           \
             ASSERT_EQ(val, h_in[h_idx[i] * nx + i]);    \
@@ -78,9 +77,9 @@ using namespace af;
         af::array in = randu(num, dty);                 \
         ty val;                                         \
         uint idx;                                       \
-        fn<ty>(&val, &idx, in);                         \
+        af::fn<ty>(&val, &idx, in);                     \
         ty *h_in = in.host<ty>();                       \
-        ty tmp = *fn##_element(h_in, h_in + num);       \
+        ty tmp = *std::fn##_element(h_in, h_in + num);  \
         ASSERT_EQ(tmp, val);                            \
         ASSERT_EQ(tmp, h_in[idx]);                      \
         delete[] h_in;                                  \
diff --git a/test/join.cpp b/test/join.cpp
index 01014456ab..0c5b1bf62c 100644
--- a/test/join.cpp
+++ b/test/join.cpp
@@ -39,7 +39,7 @@ class Join : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Join, TestTypes);
diff --git a/test/match_template.cpp b/test/match_template.cpp
index 083bdca217..adebea4ac1 100644
--- a/test/match_template.cpp
+++ b/test/match_template.cpp
@@ -26,7 +26,7 @@ class MatchTemplate : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(MatchTemplate, TestTypes);
diff --git a/test/mean.cpp b/test/mean.cpp
index 15a2c359c4..e3f7031747 100644
--- a/test/mean.cpp
+++ b/test/mean.cpp
@@ -13,6 +13,9 @@
 #include <af/traits.hpp>
 #include <string>
 #include <vector>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
 #include <testHelpers.hpp>
 
 using std::string;
@@ -28,7 +31,7 @@ class Mean : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<cdouble, cfloat, float, double, int, uint, intl, uintl, char, uchar> TestTypes;
+typedef ::testing::Types<cdouble, cfloat, float, double, int, uint, intl, uintl, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Mean, TestTypes);
@@ -50,24 +53,26 @@ struct c32HelperType {
 template<typename T>
 struct elseType {
    typedef typename cond_type< is_same_type<T, uintl>::value ||
-                               is_same_type<T, intl>::value,
+                               is_same_type<T, intl> ::value,
                                               double,
                                               T>::type type;
 };
 
 template<typename T>
 struct meanOutType {
-   typedef typename cond_type< is_same_type<T, float>::value ||
-                               is_same_type<T, int>::value ||
-                               is_same_type<T, uint>::value ||
-                               is_same_type<T, uchar>::value ||
-                               is_same_type<T, char>::value,
+   typedef typename cond_type< is_same_type<T, float>   ::value ||
+                               is_same_type<T, int>     ::value ||
+                               is_same_type<T, uint>    ::value ||
+                               is_same_type<T, uchar>   ::value ||
+                               is_same_type<T, short>   ::value ||
+                               is_same_type<T, ushort>  ::value ||
+                               is_same_type<T, char>    ::value,
                                               float,
                               typename elseType<T>::type>::type type;
 };
 
 template<typename T>
-void meanDimTest(string pFileName, dim_t dim)
+void meanDimTest(string pFileName, dim_t dim, bool isWeighted=false)
 {
     typedef typename meanOutType<T>::type outType;
     if (noDoubleTests<T>()) return;
@@ -79,71 +84,96 @@ void meanDimTest(string pFileName, dim_t dim)
 
     readTestsFromFile<int,float>(pFileName, numDims, in, tests);
 
-    af::dim4 dims      = numDims[0];
-    af_array outArray  = 0;
-    af_array inArray   = 0;
+    if (!isWeighted) {
+        af::dim4 dims = numDims[0];
+        vector<T> input(in[0].begin(), in[0].end());
 
-    vector<T> input(in[0].begin(), in[0].end());
+        af::array inArray(dims, &(input.front()));
 
-    ASSERT_EQ(AF_SUCCESS, af_create_array(&inArray, &(input.front()),
-                dims.ndims(), dims.get(), (af_dtype)af::dtype_traits<T>::af_type));
+        af::array outArray = af::mean(inArray, dim);
 
-    ASSERT_EQ(AF_SUCCESS, af_mean(&outArray, inArray, dim));
+        outType *outData = new outType[dims.elements()];
 
-    outType *outData = new outType[dims.elements()];
+        outArray.host((void*)outData);
 
-    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray));
+        vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+        size_t nElems = currGoldBar.size();
+        for (size_t elIter=0; elIter<nElems; ++elIter) {
+            ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+            ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+        }
 
-    vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
-    size_t nElems = currGoldBar.size();
-    for (size_t elIter=0; elIter<nElems; ++elIter) {
-        ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
-        ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
-    }
+        // cleanup
+        delete[] outData;
+    } else {
+        af::dim4 dims  = numDims[0];
+        af::dim4 wdims = numDims[1];
+        vector<T> input(in[0].begin(), in[0].end());
+        vector<float> weights(in[1].begin(), in[1].end());
+
+        af::array inArray(dims, &(input.front()));
+        af::array wtsArray(wdims, &(weights.front()));
+
+        af::array outArray = af::mean(inArray, wtsArray, dim);
+
+        outType *outData = new outType[dims.elements()];
+
+        outArray.host((void*)outData);
+
+        vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+        size_t nElems = currGoldBar.size();
+        for (size_t elIter=0; elIter<nElems; ++elIter) {
+            ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+            ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+        }
 
-    // cleanup
-    delete[] outData;
-    ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
-    ASSERT_EQ(AF_SUCCESS, af_release_array(outArray));
+        // cleanup
+        delete[] outData;
+    }
 }
 
 TYPED_TEST(Mean, Dim0Matrix)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim0_matrix.test"), 0);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim0_matrix.test"), 0);
 }
 
 TYPED_TEST(Mean, Dim1Cube)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim1_cube.test"), 1);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim1_cube.test"), 1);
 }
 
 TYPED_TEST(Mean, Dim0HyperCube)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim0_hypercube.test"), 0);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim0_hypercube.test"), 0);
 }
 
 TYPED_TEST(Mean, Dim2Matrix)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim2_matrix.test"), 2);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim2_matrix.test"), 2);
 }
 
 TYPED_TEST(Mean, Dim2Cube)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim2_cube.test"), 2);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim2_cube.test"), 2);
 }
 
 TYPED_TEST(Mean, Dim2HyperCube)
 {
-    meanDimTest<TypeParam>(string(TEST_DIR"/mean/mean_dim2_hypercube.test"), 2);
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/mean_dim2_hypercube.test"), 2);
 }
 
-//////////////////////////////// CPP ////////////////////////////////////
-// test mean_all interface using cpp api
+TYPED_TEST(Mean, Wtd_Dim0Matrix)
+{
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/wtd_mean_dim0_mat.test"), 0, true);
+}
 
-#include <iostream>
+TYPED_TEST(Mean, Wtd_Dim1Matrix)
+{
+    meanDimTest<TypeParam>(string(TEST_DIR "/mean/wtd_mean_dim1_mat.test"), 1, true);
+}
 
 template<typename T>
-void testCPPMean(T const_value, af::dim4 dims)
+void meanAllTest(T const_value, af::dim4 dims)
 {
     typedef typename meanOutType<T>::type outType;
     if (noDoubleTests<T>()) return;
@@ -168,42 +198,112 @@ void testCPPMean(T const_value, af::dim4 dims)
     ASSERT_NEAR(::imag(output), ::imag(gold), 1.0e-3);
 }
 
-TEST(Mean, CPP_f64)
+TEST(MeanAll, f64)
 {
-    testCPPMean<double>(2.1, af::dim4(10, 10, 1, 1));
+    meanAllTest<double>(2.1, af::dim4(10, 10, 1, 1));
 }
 
-TEST(Mean, CPP_f32)
+TEST(MeanAll, f32)
 {
-    testCPPMean<float>(2.1f, af::dim4(10, 5, 2, 1));
+    meanAllTest<float>(2.1f, af::dim4(10, 5, 2, 1));
 }
 
-TEST(Mean, CPP_s32)
+TEST(MeanAll, s32)
 {
-    testCPPMean<int>(2, af::dim4(5, 5, 2, 2));
+    meanAllTest<int>(2, af::dim4(5, 5, 2, 2));
 }
 
-TEST(Mean, CPP_u32)
+TEST(MeanAll, u32)
 {
-    testCPPMean<unsigned>(2, af::dim4(100, 1, 1, 1));
+    meanAllTest<unsigned>(2, af::dim4(100, 1, 1, 1));
 }
 
-TEST(Mean, CPP_s8)
+TEST(MeanAll, s8)
 {
-    testCPPMean<char>(2, af::dim4(5, 5, 2, 2));
+    meanAllTest<char>(2, af::dim4(5, 5, 2, 2));
 }
 
-TEST(Mean, CPP_u8)
+TEST(MeanAll, u8)
 {
-    testCPPMean<uchar>(2, af::dim4(100, 1, 1, 1));
+    meanAllTest<uchar>(2, af::dim4(100, 1, 1, 1));
 }
 
-TEST(Mean, CPP_cfloat)
+TEST(MeanAll, c32)
 {
-    testCPPMean<cfloat>(cfloat(2.1f), af::dim4(10, 5, 2, 1));
+    meanAllTest<cfloat>(cfloat(2.1f), af::dim4(10, 5, 2, 1));
+}
+
+TEST(MeanAll, s16)
+{
+    meanAllTest<short>(2, af::dim4(5, 5, 2, 2));
+}
+
+TEST(MeanAll, u16)
+{
+    meanAllTest<ushort>(2, af::dim4(100, 1, 1, 1));
+}
+
+TEST(MeanAll, c64)
+{
+    meanAllTest<cdouble>(cdouble(2.1), af::dim4(10, 10, 1, 1));
+}
+
+
+template<typename T>
+T random() { return T(std::rand()%10); }
+
+template<> cfloat random<cfloat>() { return cfloat(float(std::rand()%10), float(std::rand()%10)); }
+
+template<> cdouble random<cdouble>() { return cdouble(double(std::rand()%10), double(std::rand()%10)); }
+
+template<typename T>
+class WeightedMean : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+// register the type list
+TYPED_TEST_CASE(WeightedMean, TestTypes);
+
+template<typename T, typename wtsType>
+void weightedMeanAllTest(af::dim4 dims)
+{
+    typedef typename meanOutType<T>::type outType;
+
+    if (noDoubleTests<T>()) return;
+    if (noDoubleTests<outType>()) return;
+    if (noDoubleTests<wtsType>()) return;
+
+    using af::array;
+    using af::mean;
+
+    std::srand(std::time(0));
+
+    vector<T> data(dims.elements());
+    vector<wtsType> wts(dims.elements());
+    std::generate(data.begin(), data.end(), random<T>);
+    std::generate(wts.begin(), wts.end(), random<wtsType>);
+
+    outType wtdSum = outType(0);
+    wtsType wtsSum = wtsType(0);
+
+    for(int i = 0; i < (int)data.size(); i++) {
+        wtdSum = wtdSum + data[i]*wts[i];
+        wtsSum = wtsSum + wts[i];
+    }
+
+    outType gold = wtdSum / wtsSum;
+
+    array a(dims, &(data.front()));
+    array w(dims, &(wts.front()));
+    outType output = mean<outType>(a, w);
+
+    ASSERT_NEAR(::real(output), ::real(gold), 1.0e-2);
+    ASSERT_NEAR(::imag(output), ::imag(gold), 1.0e-2);
 }
 
-TEST(Mean, CPP_cdouble)
+TYPED_TEST(WeightedMean, Basic)
 {
-    testCPPMean<cdouble>(cdouble(2.1), af::dim4(10, 10, 1, 1));
+    weightedMeanAllTest<TypeParam, float>(af::dim4(32, 30, 33, 17));
 }
diff --git a/test/meanshift.cpp b/test/meanshift.cpp
index 5f1f9a4e3c..7363350e80 100644
--- a/test/meanshift.cpp
+++ b/test/meanshift.cpp
@@ -27,7 +27,7 @@ class Meanshift : public ::testing::Test
         virtual void SetUp() {}
 };
 
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort, intl, uintl> TestTypes;
 
 TYPED_TEST_CASE(Meanshift, TestTypes);
 
diff --git a/test/medfilt.cpp b/test/medfilt.cpp
index db00d94e51..99dd0b6757 100644
--- a/test/medfilt.cpp
+++ b/test/medfilt.cpp
@@ -26,7 +26,7 @@ class MedianFilter : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(MedianFilter, TestTypes);
diff --git a/test/median.cpp b/test/median.cpp
index fcd0393ca7..e0b21ba281 100644
--- a/test/median.cpp
+++ b/test/median.cpp
@@ -127,4 +127,6 @@ MEDIAN0(float, float)
 MEDIAN0(float, int)
 MEDIAN0(float, uint)
 MEDIAN0(float, uchar)
+MEDIAN0(float, short)
+MEDIAN0(float, ushort)
 MEDIAN0(double, double)
diff --git a/test/missing.cpp b/test/missing.cpp
index ff318ac5bf..c06fdf8220 100644
--- a/test/missing.cpp
+++ b/test/missing.cpp
@@ -19,7 +19,6 @@ TEST(MissingFunctionTests, Dummy)
 {
     array A = randu(10,10, f32);
     af_print(A);
-    af_print(rank(A));
     af_print(arg(A));
     af_print(arg(complex(A, A)));
     af_print(trunc(3 * A));
@@ -31,5 +30,4 @@ TEST(MissingFunctionTests, Dummy)
     af_print(minfilt(A, 3, 3) - erode(A, constant(1, 3,3)));
     af_print(maxfilt(A, 3, 3) - dilate(A, constant(1, 3,3)));
     printf("%lf\n", norm(A));
-    printf("%lf\n", det<double>(A));
 }
diff --git a/test/moddims.cpp b/test/moddims.cpp
index 5fe751bbc0..053948dbe2 100644
--- a/test/moddims.cpp
+++ b/test/moddims.cpp
@@ -36,7 +36,7 @@ class Moddims : public ::testing::Test
 
 // create a list of types to be tested
 // TODO: complex types tests have to be added
-typedef ::testing::Types<float, double, int, unsigned, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, int, unsigned, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Moddims, TestTypes);
diff --git a/test/morph.cpp b/test/morph.cpp
index 04de84f8a9..d73ca9b50d 100644
--- a/test/morph.cpp
+++ b/test/morph.cpp
@@ -27,7 +27,7 @@ class Morph : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Morph, TestTypes);
diff --git a/test/nearest_neighbour.cpp b/test/nearest_neighbour.cpp
index 3ca166b1ab..2bca086f11 100644
--- a/test/nearest_neighbour.cpp
+++ b/test/nearest_neighbour.cpp
@@ -28,7 +28,7 @@ class NearestNeighbour : public ::testing::Test
 };
 
 // create lists of types to be tested
-typedef ::testing::Types<float, double, int, uint, intl, uintl, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, intl, uintl, uchar, short, ushort> TestTypes;
 
 template<typename T>
 struct otype_t
@@ -36,6 +36,18 @@ struct otype_t
     typedef T otype;
 };
 
+template<>
+struct otype_t<short>
+{
+    typedef int otype;
+};
+
+template<>
+struct otype_t<ushort>
+{
+    typedef uint otype;
+};
+
 template<>
 struct otype_t<uchar>
 {
diff --git a/test/random.cpp b/test/random.cpp
index 4ca5126b2a..29f157a776 100644
--- a/test/random.cpp
+++ b/test/random.cpp
@@ -178,7 +178,7 @@ void testSetSeed(const uintl seed0, const uintl seed1, bool is_norm = false)
 
     for (int i = 0; i < num; i++) {
         // Verify if same seed produces same arrays
-        ASSERT_EQ(h_in0[i], h_in2[i]);
+        ASSERT_EQ(h_in0[i], h_in2[i]) << "at : " << i;
 
         // Verify different arrays created with different seeds differ
         // b8 and u9 can clash because they generate a small set of values
diff --git a/test/range.cpp b/test/range.cpp
index 6d7d9b7bc7..be4c22b8fd 100644
--- a/test/range.cpp
+++ b/test/range.cpp
@@ -38,7 +38,7 @@ class Range : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, unsigned int, intl, uintl, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, int, unsigned int, intl, uintl, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Range, TestTypes);
diff --git a/test/rank_dense.cpp b/test/rank_dense.cpp
index 3ecf49784c..96b44497f1 100644
--- a/test/rank_dense.cpp
+++ b/test/rank_dense.cpp
@@ -30,8 +30,14 @@ class Rank : public ::testing::Test
 {
 };
 
+template<typename T>
+class Det : public ::testing::Test
+{
+};
+
 typedef ::testing::Types<float, double, af::cfloat, af::cdouble> TestTypes;
 TYPED_TEST_CASE(Rank, TestTypes);
+TYPED_TEST_CASE(Det, TestTypes);
 
 template<typename T>
 void rankSmall()
@@ -86,3 +92,27 @@ TYPED_TEST(Rank, low)
 {
     rankBig<TypeParam>(512);
 }
+
+template<typename T>
+void detTest()
+{
+    if (noDoubleTests<T>()) return;
+    af::dtype dt = (af::dtype)af::dtype_traits<T>::af_type;
+
+    vector<af::dim4> numDims;
+
+    vector<vector<float> >   in;
+    vector<vector<float> >   tests;
+    readTests<float,float,float>(string(TEST_DIR"/lapack/detSmall.test"),numDims,in,tests);
+    af::dim4 dims       = numDims[0];
+
+    af::array input = af::array(dims, &(in[0].front())).as(dt);
+    T output = af::det<T>(input);
+
+    ASSERT_NEAR(abs((T)tests[0][0]), abs(output), 1e-6);
+}
+
+TYPED_TEST(Det, Small)
+{
+    detTest<TypeParam>();
+}
diff --git a/test/reduce.cpp b/test/reduce.cpp
index eef711db94..f71dc76b80 100644
--- a/test/reduce.cpp
+++ b/test/reduce.cpp
@@ -31,7 +31,7 @@ class Reduce : public ::testing::Test
 {
 };
 
-typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar> TestTypes;
+typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar, short, ushort> TestTypes;
 TYPED_TEST_CASE(Reduce, TestTypes);
 
 typedef af_err (*reduceFunc)(af_array *, const af_array, const int);
@@ -125,10 +125,14 @@ struct promote_type {
 };
 
 // char and uchar are promoted to int for sum and product
-template<> struct promote_type<uchar, af_sum>       { typedef uint type; };
-template<> struct promote_type<char , af_sum>       { typedef uint type; };
-template<> struct promote_type<uchar, af_product>   { typedef uint type; };
-template<> struct promote_type<char , af_product>   { typedef uint type; };
+template<> struct promote_type<uchar , af_sum>       { typedef uint type; };
+template<> struct promote_type<char  , af_sum>       { typedef uint type; };
+template<> struct promote_type<short , af_sum>       { typedef int  type; };
+template<> struct promote_type<ushort, af_sum>       { typedef uint type; };
+template<> struct promote_type<uchar , af_product>   { typedef uint type; };
+template<> struct promote_type<char  , af_product>   { typedef uint type; };
+template<> struct promote_type<short, af_product>    { typedef int  type; };
+template<> struct promote_type<ushort, af_product>   { typedef uint type; };
 
 #define REDUCE_TESTS(FN)                                                                    \
     TYPED_TEST(Reduce,Test_##FN)                                                    \
diff --git a/test/regions.cpp b/test/regions.cpp
index 273f336463..fccb902f46 100644
--- a/test/regions.cpp
+++ b/test/regions.cpp
@@ -33,7 +33,7 @@ class Regions : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, unsigned> TestTypes;
+typedef ::testing::Types<float, double, int, unsigned, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Regions, TestTypes);
diff --git a/test/reorder.cpp b/test/reorder.cpp
index 789fbfbbc8..4b57170c42 100644
--- a/test/reorder.cpp
+++ b/test/reorder.cpp
@@ -38,7 +38,7 @@ class Reorder : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Reorder, TestTypes);
diff --git a/test/replace.cpp b/test/replace.cpp
index 34316b3a99..c6d3b5d042 100644
--- a/test/replace.cpp
+++ b/test/replace.cpp
@@ -24,7 +24,7 @@ class Replace : public ::testing::Test
 {
 };
 
-typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar, char> TestTypes;
+typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar, char, short, ushort> TestTypes;
 
 TYPED_TEST_CASE(Replace, TestTypes);
 
diff --git a/test/resize.cpp b/test/resize.cpp
index 0be2af434b..6ec4e553c6 100644
--- a/test/resize.cpp
+++ b/test/resize.cpp
@@ -54,7 +54,7 @@ class ResizeI : public ::testing::Test
 
 // create a list of types to be tested
 typedef ::testing::Types<float, double, cfloat, cdouble> TestTypesF;
-typedef ::testing::Types<int, unsigned, intl, uintl, unsigned char, char> TestTypesI;
+typedef ::testing::Types<int, unsigned, intl, uintl, unsigned char, char, short, ushort> TestTypesI;
 
 // register the type list
 TYPED_TEST_CASE(Resize, TestTypesF);
diff --git a/test/rotate.cpp b/test/rotate.cpp
index 00a234f4ce..f97cd3ab96 100644
--- a/test/rotate.cpp
+++ b/test/rotate.cpp
@@ -32,7 +32,7 @@ class Rotate : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, intl, char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, intl, char, short> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Rotate, TestTypes);
diff --git a/test/rotate_linear.cpp b/test/rotate_linear.cpp
index 06a643346a..29a9107e4c 100644
--- a/test/rotate_linear.cpp
+++ b/test/rotate_linear.cpp
@@ -36,7 +36,7 @@ class Rotate : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, intl, char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, intl, char, short> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Rotate, TestTypes);
diff --git a/test/sat.cpp b/test/sat.cpp
index 00261e29f8..4cfb582e71 100644
--- a/test/sat.cpp
+++ b/test/sat.cpp
@@ -26,7 +26,7 @@ class SAT : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, int, uint, char, uchar, uintl, intl> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, uintl, intl, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(SAT, TestTypes);
diff --git a/test/scan.cpp b/test/scan.cpp
index 88ee8b4f45..386568d402 100644
--- a/test/scan.cpp
+++ b/test/scan.cpp
@@ -108,7 +108,9 @@ SCAN_TESTS(accum, cdouble , cdouble   , cdouble   );
 SCAN_TESTS(accum, unsigned, unsigned  , unsigned  );
 SCAN_TESTS(accum, intl    , intl      , intl      );
 SCAN_TESTS(accum, uintl   , uintl     , uintl     );
-SCAN_TESTS(accum, uchar   , unsigned char, unsigned);
+SCAN_TESTS(accum, uchar   , uchar     , unsigned  );
+SCAN_TESTS(accum, short   , short     , int       );
+SCAN_TESTS(accum, ushort  , ushort    , uint      );
 
 TEST(Scan,Test_Scan_Big0)
 {
diff --git a/test/select.cpp b/test/select.cpp
index bc3e1f04ba..91c8110bc6 100644
--- a/test/select.cpp
+++ b/test/select.cpp
@@ -24,7 +24,7 @@ class Select : public ::testing::Test
 {
 };
 
-typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar, char> TestTypes;
+typedef ::testing::Types<float, double, af::cfloat, af::cdouble, uint, int, intl, uintl, uchar, char, short, ushort> TestTypes;
 TYPED_TEST_CASE(Select, TestTypes);
 
 template<typename T>
diff --git a/test/set.cpp b/test/set.cpp
index e879d2472a..a6d04ed45e 100644
--- a/test/set.cpp
+++ b/test/set.cpp
@@ -85,6 +85,10 @@ UNIQUE_TESTS(double)
 UNIQUE_TESTS(int)
 UNIQUE_TESTS(uint)
 UNIQUE_TESTS(uchar)
+UNIQUE_TESTS(short)
+UNIQUE_TESTS(ushort)
+UNIQUE_TESTS(intl)
+UNIQUE_TESTS(uintl)
 
 typedef af_err (*setFunc)(af_array *, const af_array, const af_array, const bool);
 
@@ -161,3 +165,7 @@ SET_TESTS(double)
 SET_TESTS(int)
 SET_TESTS(uint)
 SET_TESTS(uchar)
+SET_TESTS(short)
+SET_TESTS(ushort)
+SET_TESTS(intl)
+SET_TESTS(uintl)
diff --git a/test/shift.cpp b/test/shift.cpp
index a3cf35d679..74f418c5c1 100644
--- a/test/shift.cpp
+++ b/test/shift.cpp
@@ -38,7 +38,7 @@ class Shift : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 // register the type list
 TYPED_TEST_CASE(Shift, TestTypes);
 
diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp
index 67699cbb05..28c597ca38 100644
--- a/test/sift_nonfree.cpp
+++ b/test/sift_nonfree.cpp
@@ -234,11 +234,7 @@ void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeT
             ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
         }
 
-        bool isTypeDouble = is_same_type<T, double>::value || is_same_type<T, af::cdouble>::value;
-        if (isTypeDouble)
-            EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 2.f, 4.5f));
-        else
-            EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 4.f));
+        EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 2.f, 4.5f));
 
         ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
         ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32));
@@ -334,7 +330,7 @@ TEST(SIFT, CPP)
         ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
     }
 
-    EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 2.f));
+    EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 2.f, 4.5f));
 
     delete[] outX;
     delete[] outY;
diff --git a/test/sobel.cpp b/test/sobel.cpp
index 2ec5ab01c2..d3f4528af0 100644
--- a/test/sobel.cpp
+++ b/test/sobel.cpp
@@ -34,7 +34,7 @@ class Sobel_Integer : public ::testing::Test
 
 // create a list of types to be tested
 typedef ::testing::Types<float, double> TestTypes;
-typedef ::testing::Types<int, unsigned, char, unsigned char> TestTypesInt;
+typedef ::testing::Types<int, unsigned, char, unsigned char, short, ushort> TestTypesInt;
 
 // register the type list
 TYPED_TEST_CASE(Sobel, TestTypes);
diff --git a/test/solve_dense.cpp b/test/solve_dense.cpp
index 8f2657098c..d78ceb9d33 100644
--- a/test/solve_dense.cpp
+++ b/test/solve_dense.cpp
@@ -31,6 +31,8 @@ using af::cdouble;
 template<typename T>
 void solveTester(const int m, const int n, const int k, double eps)
 {
+    af::deviceGC();
+
     if (noDoubleTests<T>()) return;
 #if 1
     af::array A  = cpu_randu<T>(af::dim4(m, n));
@@ -56,6 +58,8 @@ void solveTester(const int m, const int n, const int k, double eps)
 template<typename T>
 void solveLUTester(const int n, const int k, double eps)
 {
+    af::deviceGC();
+
     if (noDoubleTests<T>()) return;
 #if 1
     af::array A  = cpu_randu<T>(af::dim4(n, n));
@@ -81,6 +85,8 @@ void solveLUTester(const int n, const int k, double eps)
 template<typename T>
 void solveTriangleTester(const int n, const int k, bool is_upper, double eps)
 {
+    af::deviceGC();
+
     if (noDoubleTests<T>()) return;
 #if 1
     af::array A  = cpu_randu<T>(af::dim4(n, n));
diff --git a/test/sort.cpp b/test/sort.cpp
index 7377d2a9f8..7ec6f5565e 100644
--- a/test/sort.cpp
+++ b/test/sort.cpp
@@ -38,7 +38,7 @@ class Sort : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, uint, int, uchar> TestTypes;
+typedef ::testing::Types<float, double, uint, int, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Sort, TestTypes);
diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp
index 4f817aad9d..3d82b9fd90 100644
--- a/test/sort_by_key.cpp
+++ b/test/sort_by_key.cpp
@@ -38,7 +38,7 @@ class Sort : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, uint, int, uchar> TestTypes;
+typedef ::testing::Types<float, double, uint, int, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Sort, TestTypes);
@@ -115,9 +115,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(Sort10x10True,  sort_by_key_2D,    true,  0, 1);
     SORT_INIT(Sort10x10False, sort_by_key_2D,    false, 2, 3);
     SORT_INIT(Sort1000True,   sort_by_key_1000,  true,  0, 1);
-    SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
     SORT_INIT(SortMedTrue,    sort_by_key_med,   true,  0, 1);
-    SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
+    // FIXME: below two tests are disabled temporarily until issue#995 is fixed
+    //SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
+    //SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
     // Takes too much time in current implementation. Enable when everything is parallel
     //SORT_INIT(SortLargeTrue,  sort_by_key_large, true,  0, 1);
     //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3);
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index f4296266fd..0711e8b494 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -38,7 +38,7 @@ class Sort : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, uint, int, uchar> TestTypes;
+typedef ::testing::Types<float, double, uint, int, uchar, short, ushort, intl, uintl> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Sort, TestTypes);
@@ -116,9 +116,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(Sort10x10True,  sort_10x10, true,  0, 1);
     SORT_INIT(Sort10x10False, sort_10x10, false, 2, 3);
     SORT_INIT(Sort1000True,   sort_1000,  true,  0, 1);
-    SORT_INIT(Sort1000False,  sort_1000,  false, 2, 3);
     SORT_INIT(SortMedTrue,    sort_med1,  true,  0, 1);
-    SORT_INIT(SortMedFalse,   sort_med1,  false, 2, 3);
+    // FIXME: below two tests are disabled temporarily until issue#995 is fixed
+    //SORT_INIT(Sort1000False,  sort_1000,  false, 2, 3);
+    //SORT_INIT(SortMedFalse,   sort_med1,  false, 2, 3);
     // Takes too much time in current implementation. Enable when everything is parallel
     //SORT_INIT(SortMed5True,   sort_med,   true,  0, 1);
     //SORT_INIT(SortMed5False,  sort_med,   false, 2, 3);
diff --git a/test/stdev.cpp b/test/stdev.cpp
new file mode 100644
index 0000000000..f33d4e38fa
--- /dev/null
+++ b/test/stdev.cpp
@@ -0,0 +1,207 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <string>
+#include <vector>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
+#include <testHelpers.hpp>
+
+using namespace std;
+using namespace af;
+
+template<typename T>
+class StandardDev : public ::testing::Test
+{
+    public:
+        virtual void SetUp() {}
+};
+
+// create a list of types to be tested
+typedef ::testing::Types<float, double, int, uint, intl, uintl, char, uchar> TestTypes;
+
+// register the type list
+TYPED_TEST_CASE(StandardDev, TestTypes);
+
+template<typename T>
+struct f32HelperType {
+   typedef typename cond_type<is_same_type<T, double>::value,
+                                             double,
+                                             float>::type type;
+};
+
+template<typename T>
+struct c32HelperType {
+   typedef typename cond_type<is_same_type<T, cfloat>::value,
+                                             cfloat,
+                                             typename f32HelperType<T>::type >::type type;
+};
+
+template<typename T>
+struct elseType {
+   typedef typename cond_type< is_same_type<T, uintl>::value ||
+                               is_same_type<T, intl> ::value,
+                                              double,
+                                              T>::type type;
+};
+
+template<typename T>
+struct sdOutType {
+   typedef typename cond_type< is_same_type<T, float>   ::value ||
+                               is_same_type<T, int>     ::value ||
+                               is_same_type<T, uint>    ::value ||
+                               is_same_type<T, uchar>   ::value ||
+                               is_same_type<T, short>   ::value ||
+                               is_same_type<T, ushort>  ::value ||
+                               is_same_type<T, char>    ::value,
+                                              float,
+                              typename elseType<T>::type>::type type;
+};
+
+template<typename T>
+void stdevDimTest(string pFileName, dim_t dim=-1)
+{
+    typedef typename sdOutType<T>::type outType;
+    if (noDoubleTests<T>()) return;
+    if (noDoubleTests<outType>()) return;
+
+    vector<af::dim4>      numDims;
+    vector<vector<int> >       in;
+    vector<vector<float> >  tests;
+
+    readTestsFromFile<int,float>(pFileName, numDims, in, tests);
+
+    af::dim4 dims = numDims[0];
+    vector<T> input(in[0].begin(), in[0].end());
+
+    af::array a(dims, &(input.front()));
+
+    af::array b = stdev(a, dim);
+
+    vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+
+    size_t nElems    = currGoldBar.size();
+    outType *outData = new outType[nElems];
+
+    b.host((void*)outData);
+
+    for (size_t elIter=0; elIter<nElems; ++elIter) {
+        ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+        ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+    }
+
+    // cleanup
+    delete[] outData;
+}
+
+TYPED_TEST(StandardDev, Dim0)
+{
+    stdevDimTest<TypeParam>(string(TEST_DIR "/stdev/mat_10x10_dim0.test"), 0);
+}
+
+TYPED_TEST(StandardDev, Dim1)
+{
+    stdevDimTest<TypeParam>(string(TEST_DIR "/stdev/mat_10x10_dim1.test"), 1);
+}
+
+TYPED_TEST(StandardDev, Dim2)
+{
+    stdevDimTest<TypeParam>(string(TEST_DIR "/stdev/hypercube_10x10x5x5_dim2.test"), 2);
+}
+
+TYPED_TEST(StandardDev, Dim3)
+{
+    stdevDimTest<TypeParam>(string(TEST_DIR "/stdev/hypercube_10x10x5x5_dim3.test"), 3);
+}
+
+TEST(StandardDev, InvalidDim)
+{
+    ASSERT_THROW(af::stdev(af::array(), 5), af::exception);
+}
+
+TEST(StandardDev, InvalidType)
+{
+    ASSERT_THROW(af::stdev(constant(cdouble(1.0, -1.0), 10)), af::exception);
+}
+
+template<typename T>
+void stdevDimIndexTest(string pFileName, dim_t dim=-1)
+{
+    typedef typename sdOutType<T>::type outType;
+    if (noDoubleTests<T>()) return;
+    if (noDoubleTests<outType>()) return;
+
+    vector<af::dim4>      numDims;
+    vector<vector<int> >       in;
+    vector<vector<float> >  tests;
+
+    readTestsFromFile<int,float>(pFileName, numDims, in, tests);
+
+    af::dim4 dims = numDims[0];
+    vector<T> input(in[0].begin(), in[0].end());
+
+    af::array a(dims, &(input.front()));
+    af::array b = a(seq(2,6), seq(1,7));
+
+    af::array c = stdev(b, dim);
+
+    vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+
+    size_t nElems    = currGoldBar.size();
+    outType *outData = new outType[nElems];
+
+    c.host((void*)outData);
+
+    for (size_t elIter=0; elIter<nElems; ++elIter) {
+        ASSERT_NEAR(::real(currGoldBar[elIter]), ::real(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+        ASSERT_NEAR(::imag(currGoldBar[elIter]), ::imag(outData[elIter]), 1.0e-3)<< "at: " << elIter<< std::endl;
+    }
+
+    // cleanup
+    delete[] outData;
+}
+
+TYPED_TEST(StandardDev, IndexedArrayDim0)
+{
+    stdevDimIndexTest<TypeParam>(string(TEST_DIR "/stdev/mat_10x10_seq2_6x1_7_dim0.test"), 0);
+}
+
+TYPED_TEST(StandardDev, IndexedArrayDim1)
+{
+    stdevDimIndexTest<TypeParam>(string(TEST_DIR "/stdev/mat_10x10_seq2_6x1_7_dim1.test"), 1);
+}
+
+TYPED_TEST(StandardDev, All)
+{
+    typedef typename sdOutType<TypeParam>::type outType;
+    if (noDoubleTests<TypeParam>()) return;
+    if (noDoubleTests<outType>()) return;
+
+    vector<af::dim4>      numDims;
+    vector<vector<int> >       in;
+    vector<vector<float> >  tests;
+
+    readTestsFromFile<int,float>(string(TEST_DIR "/stdev/mat_10x10_scalar.test"),
+                                 numDims, in, tests);
+
+    af::dim4 dims = numDims[0];
+    vector<TypeParam> input(in[0].begin(), in[0].end());
+
+    af::array a(dims, &(input.front()));
+    outType b = stdev<outType>(a);
+
+    vector<outType> currGoldBar(tests[0].begin(), tests[0].end());
+    ASSERT_NEAR(::real(currGoldBar[0]), ::real(b), 1.0e-3);
+    ASSERT_NEAR(::imag(currGoldBar[0]), ::imag(b), 1.0e-3);
+}
diff --git a/test/susan.cpp b/test/susan.cpp
index 4e6995350c..01ed2288f2 100644
--- a/test/susan.cpp
+++ b/test/susan.cpp
@@ -55,7 +55,7 @@ class Susan : public ::testing::Test
         virtual void SetUp() {}
 };
 
-typedef ::testing::Types<float, double, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, double, int, uint, char, uchar, short, ushort> TestTypes;
 
 TYPED_TEST_CASE(Susan, TestTypes);
 
diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp
index 09e1dc2969..e982e9005d 100644
--- a/test/testHelpers.hpp
+++ b/test/testHelpers.hpp
@@ -17,8 +17,9 @@
 #include <af/dim4.hpp>
 #include <af/array.h>
 
-typedef unsigned char uchar;
-typedef unsigned int uint;
+typedef unsigned char  uchar;
+typedef unsigned int   uint;
+typedef unsigned short ushort;
 
 template<typename inType, typename outType, typename FileElementType>
 void readTests(const std::string &FileName, std::vector<af::dim4> &inputDims,
@@ -361,42 +362,18 @@ struct cond_type<false, T, Other> {
 };
 
 template<typename T>
-double real(T val) { return real(val); }
+double real(T val) { return (double)val; }
 template<>
-double real<double>(double val) { return val; }
+double real<af::cdouble>(af::cdouble val) { return real(val); }
 template<>
-double real<float>(float val) { return val; }
-template<>
-double real<int>(int val) { return val; }
-template<>
-double real<char>(char val) { return val; }
-template<>
-double real<uchar>(uchar val) { return val; }
-template<>
-double real<uint>(uint val) { return val; }
-template<>
-double real<intl>(intl val) { return val; }
-template<>
-double real<uintl>(uintl val) { return val; }
+double real<af::cfloat> (af::cfloat val) { return real(val); }
 
 template<typename T>
-double imag(T val) { return imag(val); }
-template<>
-double imag<double>(double val) { return 0; }
-template<>
-double imag<float>(float val) { return 0; }
-template<>
-double imag<int>(int val) { return 0; }
-template<>
-double imag<uint>(uint val) { return 0; }
-template<>
-double imag<intl>(intl val) { return 0; }
-template<>
-double imag<uintl>(uintl val) { return 0; }
+double imag(T val) { return (double)val; }
 template<>
-double imag<char>(char val) { return 0; }
+double imag<af::cdouble>(af::cdouble val) { return imag(val); }
 template<>
-double imag<uchar>(uchar val) { return 0; }
+double imag<af::cfloat> (af::cfloat val) { return imag(val); }
 
 template<typename T>
 bool noDoubleTests()
diff --git a/test/tile.cpp b/test/tile.cpp
index adeda5b4e4..964b77f0b2 100644
--- a/test/tile.cpp
+++ b/test/tile.cpp
@@ -38,7 +38,7 @@ class Tile : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Tile, TestTypes);
diff --git a/test/translate.cpp b/test/translate.cpp
index cd2df331bb..5b00c04ec8 100644
--- a/test/translate.cpp
+++ b/test/translate.cpp
@@ -41,7 +41,7 @@ class TranslateInt : public ::testing::Test
 
 // create a list of types to be tested
 typedef ::testing::Types<float, double, cfloat, cdouble> TestTypes;
-typedef ::testing::Types<int, intl, char> TestTypesInt;
+typedef ::testing::Types<int, intl, char, short> TestTypesInt;
 
 // register the type list
 TYPED_TEST_CASE(Translate, TestTypes);
diff --git a/test/transpose.cpp b/test/transpose.cpp
index 1e4ee473be..6be1ba49ab 100644
--- a/test/transpose.cpp
+++ b/test/transpose.cpp
@@ -37,7 +37,7 @@ class Transpose : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, cfloat, double, cdouble, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, cfloat, double, cdouble, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Transpose, TestTypes);
diff --git a/test/transpose_inplace.cpp b/test/transpose_inplace.cpp
index 34e17647c8..a54ff75d34 100644
--- a/test/transpose_inplace.cpp
+++ b/test/transpose_inplace.cpp
@@ -29,7 +29,7 @@ class Transpose : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, cfloat, double, cdouble, int, uint, char, uchar> TestTypes;
+typedef ::testing::Types<float, cfloat, double, cdouble, int, uint, char, uchar, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Transpose, TestTypes);
diff --git a/test/triangle.cpp b/test/triangle.cpp
index d3bed920cc..e0b609b9ab 100644
--- a/test/triangle.cpp
+++ b/test/triangle.cpp
@@ -30,7 +30,7 @@ using af::dim4;
 template<typename T>
 class Triangle : public ::testing::Test { };
 
-typedef ::testing::Types<float, af::cfloat, double, af::cdouble, int, unsigned, char, uchar, uintl, intl> TestTypes;
+typedef ::testing::Types<float, af::cfloat, double, af::cdouble, int, unsigned, char, uchar, uintl, intl, short, ushort> TestTypes;
 TYPED_TEST_CASE(Triangle, TestTypes);
 
 template<typename T>
diff --git a/test/unwrap.cpp b/test/unwrap.cpp
index 28ec1c060d..82371d31fb 100644
--- a/test/unwrap.cpp
+++ b/test/unwrap.cpp
@@ -34,7 +34,7 @@ class Unwrap : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Unwrap, TestTypes);
diff --git a/test/var.cpp b/test/var.cpp
index fcea0ab02f..2311130f65 100644
--- a/test/var.cpp
+++ b/test/var.cpp
@@ -27,24 +27,26 @@ class Var : public ::testing::Test
 
 };
 
-typedef ::testing::Types< float, double, cfloat, cdouble, uint, int, uintl, intl, char, uchar> TestTypes;
+typedef ::testing::Types< float, double, cfloat, cdouble, uint, int, uintl, intl, char, uchar, short, ushort> TestTypes;
 TYPED_TEST_CASE(Var, TestTypes);
 
 template<typename T>
 struct elseType {
    typedef typename cond_type< is_same_type<T, uintl>::value ||
-                               is_same_type<T, intl>::value,
+                               is_same_type<T, intl> ::value,
                                               double,
                                               T>::type type;
 };
 
 template<typename T>
 struct varOutType {
-   typedef typename cond_type< is_same_type<T, float>::value ||
-                               is_same_type<T, int>::value ||
-                               is_same_type<T, uint>::value ||
-                               is_same_type<T, uchar>::value ||
-                               is_same_type<T, char>::value,
+   typedef typename cond_type< is_same_type<T, float >::value ||
+                               is_same_type<T, int   >::value ||
+                               is_same_type<T, uint  >::value ||
+                               is_same_type<T, short >::value ||
+                               is_same_type<T, ushort>::value ||
+                               is_same_type<T, uchar >::value ||
+                               is_same_type<T, char  >::value,
                                               float,
                               typename elseType<T>::type>::type type;
 };
diff --git a/test/where.cpp b/test/where.cpp
index 96bc8d50de..eb21e0d6dc 100644
--- a/test/where.cpp
+++ b/test/where.cpp
@@ -27,7 +27,7 @@ using af::cdouble;
 template<typename T>
 class Where : public ::testing::Test { };
 
-typedef ::testing::Types< float, double, cfloat, cdouble, int, uint, intl, uintl, char, uchar > TestTypes;
+typedef ::testing::Types< float, double, cfloat, cdouble, int, uint, intl, uintl, char, uchar, short, ushort> TestTypes;
 TYPED_TEST_CASE(Where, TestTypes);
 
 template<typename T>
diff --git a/test/wrap.cpp b/test/wrap.cpp
index 0a9cdc20e7..0cc6fab909 100644
--- a/test/wrap.cpp
+++ b/test/wrap.cpp
@@ -35,7 +35,7 @@ class Wrap : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, double, cfloat, cdouble, int, unsigned int, intl, uintl, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Wrap, TestTypes);
diff --git a/test/write.cpp b/test/write.cpp
index afe5f386f6..b96cb0a447 100644
--- a/test/write.cpp
+++ b/test/write.cpp
@@ -32,7 +32,7 @@ class Write : public ::testing::Test
 };
 
 // create a list of types to be tested
-typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, char, unsigned char> TestTypes;
+typedef ::testing::Types<float, cfloat, double, cdouble, int, unsigned, char, unsigned char, short, ushort> TestTypes;
 
 // register the type list
 TYPED_TEST_CASE(Write, TestTypes);